Exemplo n.º 1
0
    def test_seed_produces_same_conformers(self):
        """Check that a fixed seed yields the same conformer on every run.

        A conformer is generated several times for the same molecule with
        identical parameters (including the seed). The test fails as soon as
        any pair of results has a best-fit RMS greater than the tolerance,
        and the failure message names the offending pair.
        """
        from rdkit.Chem import AllChem
        from e3fp.conformer.util import mol_from_smiles
        from e3fp.conformer.generate import generate_conformers

        ntrials = 10
        rms_tol = 1e-2
        confgen_params = {'num_conf': 1, 'seed': 42}
        smiles = "C" * 20  # long flexible molecule
        mol = mol_from_smiles(smiles, "tmp")
        mols = [
            generate_conformers(mol, **confgen_params)[0]
            for _ in range(ntrials)
        ]

        # Compare every unordered pair of trials once.
        for i in range(ntrials):
            for j in range(i + 1, ntrials):
                rms = AllChem.GetBestRMS(mols[i], mols[j])
                # Asserting per pair stops at the first mismatch and reports
                # which trials disagreed instead of a bare "True is not false".
                self.assertFalse(
                    rms > rms_tol,
                    "Trials {} and {} differ: RMS {:.4g} > {:.4g}".format(
                        i, j, rms, rms_tol))
Exemplo n.º 2
0
def run(smiles_file,
        bits=1024,
        radius=2,
        use_chiral=False,
        out_file="molecules.csv.bz2",
        log=None):
    """Generate a 2D fingerprint for each SMILES entry and save the results.

    The SMILES entries are read with `smiles_to_dict`, each one is converted
    to a molecule and fingerprinted with `fprint2d_from_mol`, and the
    fingerprints are written to `out_file` with `lists_dicts_to_molecules`.
    A molecule whose fingerprinting raises is logged and left out of the
    fingerprint mapping; the remaining molecules are still processed.

    Parameters
    ----------
    smiles_file : str
        Input passed to `smiles_to_dict`.
    bits : int, optional
        Forwarded to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    radius : int, optional
        Forwarded to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    use_chiral : bool, optional
        Forwarded to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    out_file : str, optional
        Destination passed to `lists_dicts_to_molecules`.
    log : optional
        Passed to `setup_logging`.
    """
    setup_logging(log)

    smiles_dict = smiles_to_dict(smiles_file)
    mol_list_dict = {}
    # `.items()` works on Python 2 and 3; `.iteritems()` exists only on 2.
    for name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, name)
            logging.info("Generating fingerprint for {}".format(name))
            fp = fprint2d_from_mol(mol,
                                   bits=bits,
                                   radius=radius,
                                   use_chiral=use_chiral)
            logging.info("Generated fingerprint for {}".format(name))
            mol_list_dict.setdefault(name,
                                     []).append(fprint_to_native_tuple(fp))

        except Exception:
            # Best effort: skip this molecule, but keep the traceback in the
            # log so the cause of the failure is not lost.
            logging.warning("Fingerprinting {} failed.".format(name),
                            exc_info=True)
    fp_type = get_fprint2d_fptype(bits=bits,
                                  radius=radius,
                                  use_chiral=use_chiral)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
Exemplo n.º 3
0
def get_canonical_smiles(smiles_dict):
    """Return a dict mapping each molecule name to a canonical SMILES string.

    Each SMILES is parsed with `mol_from_smiles` (with `standardise=True`)
    and written back out with `MolToSmiles(..., isomericSmiles=True)`. If
    either step raises, the original SMILES string is kept for that molecule.

    Parameters
    ----------
    smiles_dict : dict
        Mapping of molecule name to SMILES string.

    Returns
    -------
    dict
        Mapping with the same keys as `smiles_dict`.
    """
    canonical_smiles_dict = {}
    for mol_name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, mol_name, standardise=True)
            canon_smiles = MolToSmiles(mol, isomericSmiles=True)
            canonical_smiles_dict[mol_name] = canon_smiles
        except Exception:
            # Fall back to the input SMILES on any conversion failure, but
            # let KeyboardInterrupt / SystemExit propagate (a bare `except:`
            # would swallow those too).
            canonical_smiles_dict[mol_name] = smiles
    return canonical_smiles_dict
Exemplo n.º 4
0
def main(sdf_dir,
         smiles_file,
         num_mols=10000,
         first=3,
         out_props_file="random_mols_props.txt",
         out_smiles_file="random_mols.csv.bz2"):
    """Pick random molecules that have an SDF file and tabulate properties.

    If `out_smiles_file` already exists, the molecules listed in it are
    reused. Otherwise molecules are drawn at random, without replacement,
    from `smiles_file`, keeping only those whose path returned by
    `get_sdf_file` exists, until `num_mols` have been collected or the
    candidates run out; the selection is then written to `out_smiles_file`.

    For every selected molecule the heavy-atom count and rotatable-bond
    count are written to `out_props_file` as a tab-separated table with a
    header row, sorted by molecule name.

    Parameters
    ----------
    sdf_dir : str
        Directory passed to `get_sdf_file`.
    smiles_file : str
        Input passed to `smiles_to_dict` when no previous selection exists.
    num_mols : int, optional
        Maximum number of molecules to select.
    first : int, optional
        Not used by this function.
    out_props_file : str, optional
        Path of the properties table to write.
    out_smiles_file : str, optional
        Path of the selected-molecule SMILES file (read if present,
        otherwise written).
    """
    mol_names = set()
    if os.path.isfile(out_smiles_file):
        logging.info("Loading existing random molecules.")
        smiles_dict = smiles_to_dict(out_smiles_file)
        mol_names.update(smiles_dict)
        # Not read again below; kept because `get_sdf_file` is opaque here.
        out_sdf_files_dict = {k: get_sdf_file(sdf_dir, k) for k in mol_names}
    else:
        logging.info("Loading SMILES file.")
        smiles_dict = smiles_to_dict(smiles_file)
        out_smiles_dict = {}
        out_sdf_files_dict = {}
        logging.info("Picking random molecules.")
        # Shuffle once and scan: a uniform draw without replacement that is
        # guaranteed to terminate. Repeatedly calling `random.choice` on the
        # key view fails on Python 3 and loops forever once every candidate
        # has been tried without reaching `num_mols`.
        candidates = list(smiles_dict)
        random.shuffle(candidates)
        for proto_name in candidates:
            if len(mol_names) >= num_mols:
                break
            sdf_file = get_sdf_file(sdf_dir, proto_name)
            if not os.path.isfile(sdf_file):
                continue
            mol_names.add(proto_name)
            out_smiles_dict[proto_name] = smiles_dict[proto_name]
            out_sdf_files_dict[proto_name] = sdf_file

            if len(mol_names) % 100 == 0:
                logging.info(len(mol_names))

        if len(mol_names) < num_mols:
            logging.warning(
                "Only {} of {} requested molecules have an SDF file.".format(
                    len(mol_names), num_mols))

        dict_to_smiles(out_smiles_file, out_smiles_dict)

    mol_names = sorted(mol_names)

    logging.info("Computing mol properties.")
    # Only the selected molecules are written out, so only those are parsed.
    # All properties are computed before the file is opened so that a parse
    # failure does not leave a partially written table behind.
    mol_props = {}
    for name in mol_names:
        mol = mol_from_smiles(smiles_dict[name], name)
        nheavy = mol.GetNumHeavyAtoms()
        nrot = AllChem.CalcNumRotatableBonds(mol)
        mol_props[name] = (nheavy, nrot)

    with open(out_props_file, "w") as f:
        f.write("mol_name\tnheavy\tnrot\n")
        for mol_name in mol_names:
            nheavy, nrot = mol_props[mol_name]
            f.write("{}\t{:d}\t{:d}\n".format(mol_name, nheavy, nrot))
Exemplo n.º 5
0
    def test_standardisation(self):
        """Check SMILES output before and after `mol_to_standardised_mol`.

        The molecule built from the input SMILES must round-trip to the same
        string, and the standardised molecule must produce the expected
        SMILES.
        """
        import rdkit.Chem
        from e3fp.conformer.util import (
            mol_from_smiles,
            mol_to_standardised_mol,
        )

        input_smiles = "C[N-]c1cccc[n+]1C"
        expected_standardised = "CN=c1ccccn1C"

        # Before standardisation the SMILES should be unchanged.
        raw_mol = mol_from_smiles(input_smiles, "tmp")
        raw_smiles = rdkit.Chem.MolToSmiles(raw_mol)
        self.assertEqual(raw_smiles, input_smiles)

        # After standardisation it should match the expected form.
        standardised_mol = mol_to_standardised_mol(raw_mol)
        standardised_smiles = rdkit.Chem.MolToSmiles(standardised_mol)
        self.assertEqual(standardised_smiles, expected_standardised)
Exemplo n.º 6
0
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    Molecules are read from mol2 or SMILES inputs on the master process,
    handed to `generate_conformers` through a `Parallelizer`, and the
    returned values are optionally stored through an `HDF5Buffer`.

    Parameters
    ----------
    mol2 : sequence, optional
        mol2 inputs, unpacked into `mol2_generator`. Exactly one of `mol2`
        and `smiles` must be given.
    smiles : sequence, optional
        SMILES inputs, unpacked into `smiles_generator`.
    standardise : bool, optional
        Passed to `mol_from_mol2` / `mol_from_smiles`.
    num_conf : int, optional
        Target conformer number. A falsy value is replaced by -1 on the
        master process and logged as "auto".
    first : int, optional
        Forwarded to `generate_conformers`; None or -1 is logged as "all".
    pool_multiplier : int, optional
        Forwarded to `generate_conformers`.
    rmsd_cutoff : float, optional
        Forwarded to `generate_conformers`.
    max_energy_diff : float, optional
        Forwarded to `generate_conformers`; may be None.
    forcefield : str, optional
        Must be one of `FORCEFIELD_CHOICES`.
    seed : int, optional
        Forwarded to `generate_conformers`; logged unless it is -1.
    params : optional
        Passed to `read_params`. When given, the values read from it
        replace `standardise`, `num_conf`, `first`, `pool_multiplier`,
        `rmsd_cutoff`, `max_energy_diff`, `forcefield` and `seed`.
    prioritize : bool, optional
        If True, molecules are processed in order of increasing rotatable
        bond count, then increasing exact molecular weight.
    out_dir : str, optional
        Output directory; passed to `touch_dir` and `generate_conformers`.
    compress : optional
        Forwarded to `generate_conformers`.
    overwrite : bool, optional
        Forwarded to `generate_conformers`. Also decides whether an existing
        `values_file` is appended to (not True) or replaced (True).
    values_file : str, optional
        If given, every result that is not False is stored via
        `values_to_hdf5`.
    log : optional
        Passed to `setup_logging`.
    num_proc : int, optional
        Passed to `Parallelizer`.
    parallel_mode : optional
        Passed to `Parallelizer`.
    verbose : bool, optional
        Passed to `setup_logging`.

    Raises
    ------
    ValueError
        If `forcefield` is not in `FORCEFIELD_CHOICES`.
    SystemExit
        With status 1 if the input arguments are inconsistent.
    """
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    def usage_error(message):
        """Report a usage error once (on the master) and exit with status 1."""
        if para.is_master():
            # NOTE(review): `parser` is not defined in this function;
            # presumably a module-level argument parser -- confirm it exists
            # when `run` is called from an importing module.
            parser.print_usage()
            logging.error(message)
        # Non-zero status so callers can tell the run did not happen.
        sys.exit(1)

    # Check to make sure args make sense
    if mol2 is None and smiles is None:
        usage_error("Please provide mol2 file or a SMILES file.")

    if mol2 is not None and smiles is not None:
        usage_error("Please provide only a mol2 file OR a SMILES file.")

    if num_proc and num_proc < 1:
        usage_error("Please provide at least one processor with `--num_proc`.")

    # Set up input type (exactly one of mol2/smiles is set at this point)
    in_type = "mol2" if mol2 is not None else "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            # Sort on the two numeric properties only. Sorting the raw tuples
            # would fall back to comparing the molecule objects whenever both
            # properties tie, which is an error on Python 3.
            mols_with_properties.sort(key=lambda x: x[:2])
            data_iterator = make_data_iterator(
                (x[-1] for x in mols_with_properties))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        # NOTE(review): this normalisation only runs on the master process;
        # other processes keep the original `num_conf` -- confirm intended.
        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(
                    (values_file)))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Only the master supplies input data.
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    # Values are only recorded on the master, and only when requested.
    hdf5_buffer = None
    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    try:
        for result, _ in results_iterator:
            if hdf5_buffer is not None and result is not False:
                values_to_hdf5(hdf5_buffer, result)
    finally:
        # Flush and close even if generation fails part-way, so values
        # collected so far are written and the file handle is released.
        if hdf5_buffer is not None:
            hdf5_buffer.flush()
            hdf5_buffer.close()