def main(smiles_file, params_file, sdf_dir=None, out_file="molecules.csv.bz2",
         log=None, num_proc=None, parallel_mode=None, verbose=False):
    """Fingerprint molecules."""
    setup_logging(log, verbose=verbose)
    parallelizer = Parallelizer(parallel_mode=parallel_mode,
                                num_proc=num_proc)

    # set conformer generation and fingerprinting parameters
    confgen_params, fprint_params = params_to_dicts(params_file)
    kwargs = {"save": False, "fprint_params": fprint_params}

    smiles_dict = smiles_to_dict(smiles_file)
    # names may carry a conformer suffix after '-'; count unique molecules
    mol_num = len({x.split('-')[0] for x in smiles_dict})

    if sdf_dir is not None:
        sdf_files = glob.glob(os.path.join(sdf_dir, "*.sdf*"))
        sdf_files = sorted(
            [x for x in sdf_files
             if name_from_sdf_filename(x) in smiles_dict])
        data_iter = make_data_iterator(sdf_files)
        fp_method = native_tuples_from_sdf
        logging.info("Using SDF files from {}".format(sdf_dir))
    else:
        kwargs["confgen_params"] = confgen_params
        data_iter = ((smiles, name)
                     for name, smiles in smiles_dict.items())
        fp_method = native_tuples_from_smiles
        logging.info("Will generate conformers.")
        logging.info(
            "Conformer generation params: {!r}.".format(confgen_params))
    logging.info("Fingerprinting params: {!r}.".format(fprint_params))

    # fingerprint in parallel
    logging.info("Fingerprinting {:d} molecules".format(mol_num))
    mol_list_dict = {}
    for result, data in parallelizer.run_gen(fp_method, data_iter,
                                             kwargs=kwargs):
        if not result:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        try:
            _, name = result[0]
            name = name.split('_')[0]
        except IndexError:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        mol_list_dict[name] = result
    logging.info("Finished fingerprinting molecules")

    # save to SEA molecules file
    logging.info("Saving fingerprints to {}".format(out_file))
    fp_type = fprint_params_to_fptype(**fprint_params)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
    logging.info("Finished!")
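# Usage sketch (not from the original script; module and file names below
# are hypothetical). Assuming the function above is importable and
# `params.cfg` provides the sections expected by `params_to_dicts`:
#
#     main("mols.smi", "params.cfg", sdf_dir="conformers",
#          out_file="molecules.csv.bz2", num_proc=4, verbose=True)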
def main(mfile1, mfile2, name1, name2, out_file, precision=PRECISION,
         log_freq=LOG_FREQ, num_proc=None, parallel_mode=None):
    setup_logging()
    if not out_file:
        # str.replace does not interpret '\s' as a regex pattern;
        # replace literal spaces instead
        out_file = (name1.lower().replace(' ', '_') + "_" +
                    name2.lower().replace(' ', '_') + "_tcs.csv.gz")

    # Load files
    mmap1 = load_mmap(mfile1)
    mmap2 = load_mmap(mfile2)
    if mmap1.shape != mmap2.shape:
        raise ValueError(
            "Memmaps do not have the same shape: {} {}".format(
                mmap1.shape, mmap2.shape))

    # Count binned pairs
    pair_num = mmap1.shape[0]
    del mmap1, mmap2
    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    num_proc = max(para.num_proc - 1, 1)
    chunk_bounds = np.linspace(-1, pair_num - 1, num_proc + 1, dtype=int)
    chunk_bounds = list(zip(chunk_bounds[:-1] + 1, chunk_bounds[1:]))
    logging.info("Divided into {} chunks with ranges: {}".format(
        num_proc, chunk_bounds))
    logging.info("Counting TCs in chunks.")
    kwargs = {"mfile1": mfile1, "mfile2": mfile2, "precision": precision,
              "log_freq": log_freq}
    results_iter = para.run_gen(count_tcs, chunk_bounds, kwargs=kwargs)

    tc_pair_counts = Counter()
    for chunk_counts, _ in results_iter:
        if not isinstance(chunk_counts, dict):
            logging.error("Results are not in dict form.")
            continue
        tc_pair_counts.update(chunk_counts)

    # Write pairs to file
    logging.info("Writing binned pairs to {}.".format(out_file))
    mult = 10**precision
    # csv.writer requires a text-mode stream
    with smart_open(out_file, "wt") as f:
        writer = csv.writer(f, delimiter=SEP)
        writer.writerow([name1, name2, "Count"])
        for pair in sorted(tc_pair_counts):
            writer.writerow([round(pair[0] / mult, precision),
                             round(pair[1] / mult, precision),
                             tc_pair_counts[pair]])

    total_counts = sum(tc_pair_counts.values())
    if total_counts != pair_num:
        logging.warning(
            "Pair counts {} did not match expected number {}".format(
                total_counts, pair_num))
        return
    logging.info("Completed.")
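# Worked example of the binning round-trip above (illustrative; assumes
# `count_tcs` bins each Tanimoto coefficient to the integer key
# round(tc * 10**precision), which the writer divides back out):
#
#     precision = 3
#     mult = 10**precision              # 1000
#     bin_key = round(0.4567 * mult)    # 457
#     round(bin_key / mult, precision)  # 0.457, the value written to CSV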
def main(smiles_file, sdf_dir, out_file):
    _, fprint_params = params_to_dicts(load_params())
    smiles_dict = smiles_to_dict(smiles_file)
    para = Parallelizer()
    smiles_iter = ((smiles, get_sdf_file(name, sdf_dir), name)
                   for name, smiles in smiles_dict.items())
    kwargs = {"fprint_params": fprint_params}
    results_iter = para.run_gen(benchmark_fprinting, smiles_iter,
                                kwargs=kwargs)
    with open(out_file, "w") as f:
        # tab-separated header, then one row of timings/descriptors per mol
        f.write("\t".join(["Name", "ECFP4 Time", "E3FP Time", "Num Heavy",
                           "Num Confs", "Num Rot"]) + "\n")
        for results, (_, _, name) in results_iter:
            f.write("{}\t{:.4g}\t{:.4g}\t{:d}\t{:d}\t{:d}\n".format(
                name, *results))
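# Hypothetical invocation (file names are assumptions):
#
#     main("mols.smi", "conformers", "fprint_benchmark.txt")
#
# Each output row pairs a molecule name with its ECFP4 and E3FP
# fingerprinting times plus heavy-atom, conformer, and rotatable-bond
# counts, tab-separated.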
def run(sdf_files, bits=BITS, first=FIRST_DEF, level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF, counts=COUNTS_DEF,
        stereo=STEREO_DEF, include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF, params=None,
        out_dir_base=None, out_ext=OUT_EXT_DEF, db_file=None,
        overwrite=False, all_iters=False, log=None, num_proc=None,
        parallel_mode=None, verbose=False):
    """Generate E3FP fingerprints from SDF files."""
    setup_logging(log, verbose=verbose)
    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(params, "fingerprinting",
                                      "radius_multiplier", float)
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(params, "fingerprinting",
                                         "include_disconnected", bool)
        rdkit_invariants = get_value(params, "fingerprinting",
                                     "rdkit_invariants", bool)
        exclude_floating = get_value(params, "fingerprinting",
                                     "exclude_floating", bool)

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    if para.rank == 0:
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        if db_file is None and out_dir_base is None:
            sys.exit('Either `db_file` or `out_dir_base` must be specified.')
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier))
        logging.info("Stereo Mode: {!s}".format(stereo))
        # connected-only mode is on when disconnected atoms are excluded
        if not include_disconnected:
            logging.info("Connected-only mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        data_iterator = iter([])

    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        # save per-molecule files only if an output directory was given
        "save": out_dir_base is not None,
    }
    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(fprints_dict_from_sdf, data_iterator,
                                **run_kwargs)

    if db_file is not None:
        # keep fingerprints at the requested level (or the deepest
        # available) and store them in a database
        fprints = []
        for result, data in results_iter:
            try:
                fprints.extend(
                    result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # fingerprinting failed; assume the method already logged it
                continue
        if len(fprints) > 0:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
    else:
        # no database requested; exhaust the iterator so the work runs
        list(results_iter)
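# Usage sketch (paths are hypothetical): fingerprint every SDF file in a
# conformer directory and collect the results into a fingerprint database.
# Passing a single directory works because the function globs "*sdf*"
# inside it; supplying `out_dir_base` instead writes per-molecule files.
#
#     run(["conformers"], bits=1024, level=5, db_file="e3fp.fpz")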
if para.rank == 0:
    logging.info(
        "Found total of {} mols. Selecting {} for comparison.".format(
            len(mutual_mols), mol_num))
mols = sorted(np.random.choice(mutual_mols, size=mol_num, replace=False))
# all unique unordered pairs of the selected molecules
pairs = ((i, j) for i in range(mol_num) for j in range(i + 1, mol_num))
pair_groups_iter = split_iterator_into_chunks(pairs, max_num=MAX_CHUNK_SIZE)
# wrap each chunk in a 1-tuple so run_gen passes it as a single argument
pairs_iter = ((x,) for x in pair_groups_iter)
kwargs = {
    "mol_names": mols,
    "fp_sets_dict1": ecfp_fp_sets,
    "fp_sets_dict2": e3fp_fp_sets,
    "col_prefixes": ["ECFP4", "E3FP Max"]
}
results_iter = para.run_gen(compute_tc_pairs, pairs_iter, kwargs=kwargs)

if para.rank == 0:
    logging.info("Computing TC pairs.")
    counts_df = None
    examples_df = None
    pair_num_tot = mol_num * (mol_num - 1) / 2.
    pair_num_running = 0
    cache_point = int(CACHE_FREQ * pair_num_tot)
    i = 0
for (counts_file, examples_file), data in results_iter:
    pair_num_running += len(data[0])
    if para.rank == 0:
        logging.info("{:.2f}% completed".format(
            100 * pair_num_running / float(pair_num_tot)))
    with open(counts_file, "r") as f:
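# Illustration of the pair enumeration above (not from the source): for n
# molecules there are n * (n - 1) / 2 unordered pairs, which the workers
# receive in chunks of at most MAX_CHUNK_SIZE.
#
#     n = 4
#     pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
#     assert len(pairs) == n * (n - 1) // 2  # 6 pairs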
def run(mol2=None, smiles=None, standardise=STANDARDISE_DEF,
        num_conf=NUM_CONF_DEF, first=FIRST_DEF,
        pool_multiplier=POOL_MULTIPLIER_DEF, rmsd_cutoff=RMSD_CUTOFF_DEF,
        max_energy_diff=MAX_ENERGY_DIFF_DEF, forcefield=FORCEFIELD_DEF,
        seed=SEED_DEF, params=None, prioritize=False, out_dir=OUTDIR_DEF,
        compress=COMPRESS_DEF, overwrite=False, values_file=None, log=None,
        num_proc=None, parallel_mode=None, verbose=False):
    """Run conformer generation."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation",
                                "rmsd_cutoff", float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    # `parser` is assumed to be the module-level argparse parser
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide a mol2 file or a SMILES file.")
            sys.exit()
    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide only a mol2 file OR a SMILES file.")
            sys.exit()
    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide at least one processor with `--num_proc`.")
            sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file, _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles, _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info("Prioritizing mols with low rotatable bond number"
                         " and molecular weight first.")
            mols_with_properties = [
                (AllChem.CalcNumRotatableBonds(mol),
                 AllChem.CalcExactMolWt(mol), mol)
                for mol in mol_iter if mol is not None]
            # sort by (rotatable bonds, weight); Mol objects themselves
            # are not orderable
            data_iterator = make_data_iterator(
                x[-1] for x in sorted(mols_with_properties,
                                      key=lambda x: (x[0], x[1])))
        else:
            data_iterator = make_data_iterator(
                x for x in mol_iter if x is not None)

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and not overwrite:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format(values_file))
            else:
                value_args = (values_file, "w")
                logging.info(
                    "Values file: {} (new file)".format(values_file))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal/mol".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))

        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }
    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()