Example #1
def run(smiles_file,
        bits=1024,
        radius=2,
        use_chiral=False,
        out_file="molecules.csv.bz2",
        log=None):
    setup_logging(log)

    smiles_dict = smiles_to_dict(smiles_file)
    mol_list_dict = {}
    for name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, name)
            logging.info("Generating fingerprint for {}".format(name))
            fp = fprint2d_from_mol(mol,
                                   bits=bits,
                                   radius=radius,
                                   use_chiral=use_chiral)
            logging.info("Generated fingerprint for {}".format(name))
            mol_list_dict.setdefault(name,
                                     []).append(fprint_to_native_tuple(fp))

        except Exception:
            logging.warning("Fingerprinting {} failed.".format(name))
    fp_type = get_fprint2d_fptype(bits=bits,
                                  radius=radius,
                                  use_chiral=use_chiral)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
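
The helper functions here (smiles_to_dict, fprint2d_from_mol, fprint_to_native_tuple) belong to the surrounding project, so this is only a sketch of how the entry point might be called; the file names and option values are assumptions:

# hypothetical invocation; smiles.smi is assumed to hold "SMILES name" lines
run("smiles.smi", bits=2048, radius=2, use_chiral=True,
    out_file="molecules_2048.csv.bz2", log="fingerprint.log")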
Example #2
def main(smiles_file,
         params_file,
         sdf_dir=None,
         out_file="molecules.csv.bz2",
         log=None,
         num_proc=None,
         parallel_mode=None,
         verbose=False):
    """Fingerprint molecules."""
    setup_logging(log, verbose=verbose)
    parallelizer = Parallelizer(parallel_mode="processes")

    # set conformer generation and fingerprinting parameters
    confgen_params, fprint_params = params_to_dicts(params_file)
    kwargs = {"save": False, "fprint_params": fprint_params}

    smiles_dict = smiles_to_dict(smiles_file)
    mol_num = len({x.split('-')[0] for x in smiles_dict})

    if sdf_dir is not None:
        sdf_files = glob.glob(os.path.join(sdf_dir, "*.sdf*"))
        sdf_files = sorted(
            [x for x in sdf_files if name_from_sdf_filename(x) in smiles_dict])
        data_iter = make_data_iterator(sdf_files)
        fp_method = native_tuples_from_sdf
        logging.info("Using SDF files from {}".format(sdf_dir))
    else:
        kwargs["confgen_params"] = confgen_params
        data_iter = ((smiles, name)
                     for name, smiles in smiles_dict.items())
        fp_method = native_tuples_from_smiles
        logging.info("Will generate conformers.")
        logging.info(
            "Conformer generation params: {!r}.".format(confgen_params))
    logging.info("Fingerprinting params: {!r}.".format(fprint_params))

    # fingerprint in parallel
    logging.info("Fingerprinting {:d} molecules".format(mol_num))
    mol_list_dict = {}
    for result, data in parallelizer.run_gen(fp_method,
                                             data_iter,
                                             kwargs=kwargs):
        if not result:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        try:
            _, name = result[0]
            name = name.split('_')[0]
        except IndexError:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        mol_list_dict[name] = result
    logging.info("Finished fingerprinting molecules")

    # save to SEA molecules file
    logging.info("Saving fingerprints to {}".format(out_file))
    fp_type = fprint_params_to_fptype(**fprint_params)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
    logging.info("Finished!")
Example #3
def main(molecules_file1, molecules_file2, memmap_file, mol_names_file1,
         mol_names_file2, log=None, overwrite=False, parallel_mode=None,
         num_proc=None):
    setup_logging(log)
    logging.info("Reading first molecules file.")
    fp_array1, mol_names1, mol_indices_dict1 = read_convert_mols(
        molecules_file1)
    logging.info("Reading second molecules file.")
    fp_array2, mol_names2, mol_indices_dict2 = read_convert_mols(
        molecules_file2)

    if overwrite or not os.path.isfile(memmap_file):
        logging.info("Overwriting memmap file.")
        memmap = np.memmap(memmap_file, mode="w+", dtype=np.double,
                           shape=(len(mol_names1), len(mol_names2)))
        del memmap
        save_mol_names(mol_names_file1, mol_names1)
        save_mol_names(mol_names_file2, mol_names2)

    logging.info("Computing all pairwise Tanimotos.")

    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    start_end_indices = get_start_end_inds(len(mol_names1), para.num_proc - 1)
    kwargs = {"fp_array1": fp_array1,
              "mol_names1": mol_names1,
              "mol_indices_dict1": mol_indices_dict1,
              "fp_array2": fp_array2,
              "mol_names2": mol_names2,
              "mol_indices_dict2": mol_indices_dict2,
              "memmap_file": memmap_file}
    para.run(run_batch, start_end_indices, kwargs=kwargs)
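
The memmap is created once in "w+" mode and immediately deleted so that worker processes can reopen and fill disjoint row ranges of the same file; a minimal, self-contained sketch of that pattern (file name and shape are arbitrary):

import numpy as np

# create the on-disk array and flush it
memmap = np.memmap("pairwise_tcs.dat", mode="w+", dtype=np.double, shape=(3, 4))
del memmap
# a worker reopens the same file and fills its assigned rows
view = np.memmap("pairwise_tcs.dat", mode="r+", dtype=np.double, shape=(3, 4))
view[0, :] = 0.5
view.flush()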
Example #4
def main(mfile1, mfile2, name1, name2, out_file, precision=PRECISION,
         log_freq=LOG_FREQ, num_proc=None, parallel_mode=None):
    setup_logging()
    if not out_file:
        out_file = (name1.lower().replace(' ', '_') + "_" +
                    name2.lower().replace(' ', '_') + "_tcs.csv.gz")

    # Load files
    mmap1 = load_mmap(mfile1)
    mmap2 = load_mmap(mfile2)
    if mmap1.shape != mmap2.shape:
        raise ValueError(
            "Memmaps do not have the same shape: {} {}".format(
                mmap1.shape, mmap2.shape))

    # Count binned pairs
    pair_num = mmap1.shape[0]
    del mmap1, mmap2

    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    num_proc = max(para.num_proc - 1, 1)
    chunk_bounds = np.linspace(-1, pair_num - 1, num_proc + 1, dtype=int)
    chunk_bounds = list(zip(chunk_bounds[:-1] + 1, chunk_bounds[1:]))
    logging.info("Divided into {} chunks with ranges: {}".format(num_proc,
                                                                 chunk_bounds))

    logging.info("Counting TCs in chunks.")
    kwargs = {"mfile1": mfile1, "mfile2": mfile2, "precision": precision,
              "log_freq": log_freq}
    results_iter = para.run_gen(count_tcs, chunk_bounds, kwargs=kwargs)
    tc_pair_counts = Counter()
    for chunk_counts, _ in results_iter:
        if not isinstance(chunk_counts, dict):
            logging.error("Results are not in dict form.")
            continue
        tc_pair_counts.update(chunk_counts)

    # Write pairs to file
    logging.info("Writing binned pairs to {}.".format(out_file))
    mult = 10**precision
    with smart_open(out_file, "wb") as f:
        writer = csv.writer(f, delimiter=SEP)
        writer.writerow([name1, name2, "Count"])
        for pair in sorted(tc_pair_counts):
            writer.writerow([round(pair[0] / mult, precision),
                             round(pair[1] / mult, precision),
                             tc_pair_counts[pair]])

    total_counts = sum(tc_pair_counts.values())
    if total_counts != pair_num:
        logging.warning(
            "Pair counts {} did not match expected number {}".format(
                total_counts, pair_num))
        return
    logging.info("Completed.")
Example #5
def main(mol_dbase,
         combo_tcs_file='fastrocs_combo_tcs.bin',
         shape_tcs_file='fastrocs_shape_tcs.bin',
         mol_names_file='fastrocs_mol_names.csv',
         overwrite=False,
         log=None,
         log_freq=100):
    setup_logging(log)

    logging.info("Will save tcs to {} and {} and mol names to {}".format(
        combo_tcs_file, shape_tcs_file, mol_names_file))

    ifs = oemolistream()
    if not ifs.open(mol_dbase):
        OEThrow.Fatal("Unable to open {} for reading".format(mol_dbase))

    # Configure OpenEye
    dbtype = OEShapeDatabaseType_Default
    options = OEShapeDatabaseOptions()
    options.SetScoreType(dbtype)
    combo_tc_getter = OEShapeDatabaseScore.GetTanimotoCombo
    shape_tc_getter = OEShapeDatabaseScore.GetShapeTanimoto

    combo_tcs = []
    shape_tcs = []
    dots = OEDots(log_freq, 20, "looping through molecule scores")
    last_name = None
    search_db = None
    for index, mol in enumerate(ifs.GetOEMols()):
        conf_name = mol.GetTitle()
        proto_name = conf_name.split("_")[0]
        if proto_name != last_name:
            last_name = proto_name
            search_db = OEShapeDatabase(dbtype)
            search_db.AddMol(mol)
            continue

        combo_tc = shape_tc = 0
        i = 0
        for conf in mol.GetConfs():
            for score in search_db.GetScores(conf, options):
                combo_tc = combo_tc_getter(score)
                shape_tc = shape_tc_getter(score)
                if i > 0:
                    sys.exit("More than one conformer was found in database.")
                i += 1

        dots.Update()
        combo_tcs.append(combo_tc)
        shape_tcs.append(shape_tc)
        assert len(combo_tcs) == (index + 1) // 2

    np.asarray(combo_tcs, dtype=np.double).tofile(combo_tcs_file)
    np.asarray(shape_tcs, dtype=np.double).tofile(shape_tcs_file)
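
Because tofile writes a raw, headerless stream of doubles, the arrays can be read back with numpy as long as the dtype matches; for example:

import numpy as np

combo_tcs = np.fromfile("fastrocs_combo_tcs.bin", dtype=np.double)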
Example #6
def main(molecules_file="molecules.csv.bz2",
         targets_file="targets.csv.bz2",
         k=5,
         method='sea',
         tc_files=None,
         auc_type='sum',
         process_inputs=None,
         split_by='target',
         reduce_negatives=False,
         min_mols=50,
         affinity=10000,
         out_dir="./",
         overwrite=False,
         log=None,
         num_proc=None,
         parallel_mode=None,
         verbose=False):
    setup_logging(log, verbose=verbose)
    if num_proc is None:
        num_proc = k + 1
    parallelizer = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)

    cv_class = CV_METHODS[method]
    if cv_class is MaxTanimotoCVMethod and tc_files is not None:
        score_matrix = ScoreMatrix(*tc_files)
        cv_class = cv_class(score_matrix)
    splitter_class = SPLITTERS[split_by]
    if issubclass(splitter_class, MoleculeSplitter):
        splitter = splitter_class(k=k)
    else:
        splitter = splitter_class(k=k, reduce_negatives=reduce_negatives)

    if process_inputs is not None:
        processor = InputProcessor(mode=process_inputs)
    else:
        processor = None

    kfold_cv = KFoldCrossValidator(k=k,
                                   parallelizer=parallelizer,
                                   splitter=splitter,
                                   input_processor=processor,
                                   cv_method=cv_class,
                                   return_auc_type=auc_type,
                                   out_dir=out_dir,
                                   overwrite=overwrite)
    auc = kfold_cv.run(molecules_file,
                       targets_file,
                       min_mols=min_mols,
                       affinity=affinity)
    logging.info("CV Mean AUC: {:.4f}".format(auc))
Example #7
def main(targets_map_file,
         all_molecules_file,
         all_targets_file,
         fit_file=None,
         sample=None,
         affinity=None,
         log=None,
         out_dir='./',
         verbose=False):
    setup_logging(log, verbose=verbose)
    library_from_map(targets_map_file,
                     all_molecules_file,
                     all_targets_file,
                     fit_file=fit_file,
                     sample=sample,
                     affinity=affinity,
                     out_dir=out_dir)
Example #8
def main(molecules_file,
         library_file,
         target_results_pickle=TARGET_RESULTS_PICKLE_DEF,
         mol_results_pickle=MOL_RESULTS_PICKLE_DEF,
         log_file=None,
         verbose=False):
    setup_logging(log_file, verbose=verbose)
    logging.info("Loading molecules file")
    smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    del smiles_dict, fp_type
    logging.info("Running SEA searches with {:d} molecules.".format(
        len(mol_lists_dict)))
    set_searcher = sea_set_search(library_file, mol_lists_dict)
    logging.info("Saving results to pickles.")
    with smart_open(target_results_pickle, "wb") as f:
        pickle.dump(set_searcher.target_results_dict, f)
    with smart_open(mol_results_pickle, "wb") as f:
        pickle.dump(set_searcher.set_results_dict, f)
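
smart_open is project code that presumably chooses a codec from the file extension; a plain-Python equivalent of the dump/load round trip for a .pkl.bz2 path:

import bz2
import pickle

with bz2.open("target_results.pkl.bz2", "wb") as f:
    pickle.dump({"example": 1}, f)
with bz2.open("target_results.pkl.bz2", "rb") as f:
    results = pickle.load(f)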
Example #9
def main(sdf_file,
         save_freq=SAVE_FREQ,
         overwrite=False,
         log=None,
         parallel_mode=None,
         num_proc=None):
    setup_logging(log)
    logging.info("Reading mols from SDF.")
    supp = rdkit.Chem.SDMolSupplier(sdf_file)
    num_mol = len(supp)
    del supp

    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    start_end_indices = get_triangle_indices(num_mol, para.num_proc - 1)
    kwargs = {
        "sdf_file": sdf_file,
        "save_freq": save_freq,
        "overwrite": overwrite
    }
    para.run(run_batch, start_end_indices, kwargs=kwargs)
Example #10
def main(molecules_file,
         log=None,
         overwrite=False,
         parallel_mode=None,
         num_proc=None,
         merge_confs=False,
         save_freq=SAVE_FREQ,
         compress=False):
    setup_logging(log)
    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    if para.is_master():
        data_iter = ((molecules_file, i, para.num_proc - 1)
                     for i in range(para.num_proc - 1))
    else:
        data_iter = iter([])

    kwargs = {
        "overwrite": overwrite,
        "merge_confs": merge_confs,
        "save_freq": save_freq,
        "compress": compress
    }
    para.run(run_batch, data_iter, kwargs=kwargs)
Example #11
def main(mol_dbase,
         start_index,
         end_index,
         skip_inds=set(),
         skip_next=False,
         save_freq=SAVE_FREQ,
         overwrite=False,
         merge_confs=False,
         verbose=False,
         compress=False):
    base_output_name_strings = [
        'start-{0}'.format(start_index), 'end-{0}'.format(end_index)
    ]
    if compress:
        binext = ".bin.gz"
        csvext = ".csv.gz"
    else:
        binext = ".bin"
        csvext = ".csv"

    log_file = ('_'.join(['fastrocs_log'] + base_output_name_strings) + ".txt")
    max_combo_tcs_file = ('_'.join(['fastrocs_max_combo_tcs'] +
                                   base_output_name_strings)) + binext
    max_shape_tcs_file = ('_'.join(['fastrocs_max_shape_tcs'] +
                                   base_output_name_strings)) + binext
    mol_names_file = ('_'.join(['mol_names'] + base_output_name_strings) +
                      csvext)
    if overwrite:
        safe_unlink(log_file)
    setup_logging(log_file, verbose=verbose)

    batch_size = get_batch_size(start_index, end_index)
    logging.info(
        "Will save {} max tcs to {} and {} and mol names to {}".format(
            batch_size, max_combo_tcs_file, max_shape_tcs_file,
            mol_names_file))

    total_pairs_searched = 0
    last_save_ind = -1

    # Remove files or resume
    if overwrite:
        logging.info("Removing old files.")
        safe_unlink(max_combo_tcs_file)
        safe_unlink(max_shape_tcs_file)
        safe_unlink(mol_names_file)
    elif all_exist(max_combo_tcs_file, max_shape_tcs_file, mol_names_file):
        logging.info("Resuming from existing files.")
        existing_index = get_num_mols(mol_names_file)
        last_save_ind = existing_index - 1
        total_pairs_searched = get_batch_size(start_index, existing_index - 1)
        start_index = existing_index
        logging.info("Found {0} mol names. Resuming from index {0}.".format(
            existing_index))
    elif any_exist(max_combo_tcs_file, max_shape_tcs_file, mol_names_file):
        sys.exit("Not all files exist, so cannot resume from old run.")

    if skip_next:
        skip_inds.add(start_index)
    if skip_inds:
        logging.debug("Will skip indices: {}".format(skip_inds))

    ifs = oemolistream()
    if not ifs.open(mol_dbase):
        OEThrow.Fatal("Unable to open {} for reading".format(mol_dbase))
    if merge_confs:
        ifs.SetConfTest(OEAbsoluteConfTest())  # detect and merge conformers
        mols_iter = group_mols_by_name(ifs.GetOEMols())
    else:
        mols_iter = ((x.GetTitle(), [x]) for x in ifs.GetOEMols())

    # Configure OpenEye
    dbtype = OEShapeDatabaseType_Default
    options = OEShapeDatabaseOptions()
    options.SetScoreType(dbtype)
    search_db = OEShapeDatabase(dbtype)
    combo_tc_getter = OEShapeDatabaseScore.GetTanimotoCombo
    shape_tc_getter = OEShapeDatabaseScore.GetShapeTanimoto

    max_combo_tcs_tril = [[]]
    max_shape_tcs_tril = [[]]
    search_mol_names = []
    mol_idx_to_index = []
    pairs_since_last_save = 0
    dots = OEDots(save_freq, 20, "looping through molecule scores")
    for index, (mol_name, mols) in enumerate(mols_iter):
        logging.debug("Mol {} ({})".format(index, mol_name))

        mol_idx_to_index.extend([index] * len(mols))
        if search_mol_names and index >= start_index:
            if index not in skip_inds:
                logging.debug("Scoring mol {}".format(index))
                max_combo_tcs = [0.0] * len(search_mol_names)
                max_shape_tcs = [0.0] * len(search_mol_names)
                for conf in (c for mol in mols for c in mol.GetConfs()):
                    for score in search_db.GetScores(conf, options):
                        mol_idx = score.GetMolIdx()
                        mol_id = mol_idx_to_index[mol_idx]
                        max_combo_tcs[mol_id] = max(max_combo_tcs[mol_id],
                                                    combo_tc_getter(score))
                        max_shape_tcs[mol_id] = max(max_shape_tcs[mol_id],
                                                    shape_tc_getter(score))
                logging.debug("Finished scoring mol {}".format(index))
            else:
                logging.info("Skipping index {} ({}) as requested.".format(
                    index, mol_name))
                max_combo_tcs = [-1.0] * len(search_mol_names)
                max_shape_tcs = [-1.0] * len(search_mol_names)
            max_combo_tcs_tril.append(max_combo_tcs)
            max_shape_tcs_tril.append(max_shape_tcs)
            pairs_since_last_save += len(search_mol_names)

        # Add mol to search mols
        search_mol_names.append(mol_name)
        logging.debug("Adding mol {} to search db".format(index))
        for mol in mols:
            search_db.AddMol(mol)
        logging.debug("Finished adding mol {} to search db".format(index))
        dots.Update()

        # Cache results to file
        if (search_mol_names and
            ((index >= start_index and
              (pairs_since_last_save >= save_freq
               or end_index and index >= end_index)) or index == start_index)):
            total_pairs_searched += pairs_since_last_save
            perc_complete = total_pairs_searched / float(batch_size)
            logging.info(
                ("{} molecules recorded. Appending shape tcs to {}, "
                 "combo tcs to {}, and mol names to {}. ({:.4%})").format(
                     len(search_mol_names), max_shape_tcs_file,
                     max_combo_tcs_file, mol_names_file, perc_complete))
            cache_tcs_to_binary(max_combo_tcs_file, max_combo_tcs_tril)
            cache_tcs_to_binary(max_shape_tcs_file, max_shape_tcs_tril)
            cache_mol_names(mol_names_file,
                            search_mol_names[last_save_ind + 1:])
            pairs_since_last_save = 0
            last_save_ind = index

        if end_index and index >= end_index:
            logging.info("Ending at index {0} as requested ({1})".format(
                index, end_index))
            return 0
    return 0
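
group_mols_by_name is also project code; given that conformer titles share a prefix before the first underscore (see the merge_confs branch above), it presumably behaves roughly like this sketch:

from itertools import groupby

def group_by_base_name(mols):
    # hypothetical stand-in: yield (base_name, [mols]) for consecutive
    # molecules whose titles share the prefix before the first underscore
    for name, group in groupby(mols, key=lambda m: m.GetTitle().split("_")[0]):
        yield name, list(group)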
Example #12
def main(query_molecules_file,
         query_targets_file,
         target_molecules_file,
         target_targets_file,
         method=SEASearchCVMethod,
         fit_file=None,
         log=None,
         out_dir="./"):
    setup_logging(log)

    method = method()
    method.out_dir = out_dir
    touch_dir(out_dir)
    if fit_file is None:
        fit_file = os.path.join(out_dir, "library.fit")

    logging.info("Loading target files.")
    if isinstance(method, SEASearchCVMethod):
        method.fit_file = fit_file
        (_, target_targets_dict, target_smiles_dict, target_mol_list_dict,
         target_fp_type, target_target_list,
         target_mol_list) = process_input_files(target_molecules_file,
                                                target_targets_file,
                                                sea_format=True)

        logging.info("Saving target SEA files.")
        dict_to_targets(method.train_targets_file, target_targets_dict)
        lists_dicts_to_molecules(method.train_molecules_file,
                                 target_smiles_dict, target_mol_list_dict,
                                 target_fp_type)

        target_fp_array = None
        target_mol_to_fp_inds = None
        target_target_mol_array = None
        mask = None
    else:
        (target_fp_array, target_mol_to_fp_inds, target_target_mol_array,
         target_target_list,
         target_mol_list) = process_input_files(target_molecules_file,
                                                target_targets_file,
                                                sea_format=False)
        mask = np.ones_like(target_target_mol_array, dtype=np.bool_)

    method.train(target_fp_array,
                 target_mol_to_fp_inds,
                 target_target_mol_array,
                 target_target_list,
                 target_mol_list,
                 mask=mask)

    logging.info("Loading query files.")
    if isinstance(method, SEASearchCVMethod):
        (query_target_mol_array, query_targets_dict, query_smiles_dict,
         query_mol_list_dict, query_fp_type, query_target_list,
         query_mol_list) = process_input_files(query_molecules_file,
                                               query_targets_file,
                                               sea_format=True)

        logging.info("Saving query SEA files.")
        lists_dicts_to_molecules(method.test_molecules_file, query_smiles_dict,
                                 query_mol_list_dict, query_fp_type)

        query_fp_array = None
        query_mol_to_fp_inds = None
    else:
        (query_fp_array, query_mol_to_fp_inds, query_target_mol_array,
         query_target_list,
         query_mol_list) = process_input_files(query_molecules_file,
                                               query_targets_file,
                                               sea_format=False)

    mask = np.ones_like(query_target_mol_array, dtype=np.bool_)
    results = method.test(query_fp_array,
                          query_mol_to_fp_inds,
                          query_target_mol_array,
                          query_target_list,
                          query_mol_list,
                          mask=mask)

    y_true = query_target_mol_array.ravel()
    y_score = results.ravel()
    not_nan_inds = np.where(~np.isnan(y_score))
    y_true, y_score = y_true[not_nan_inds], y_score[not_nan_inds]

    logging.info("Computing results curves.")
    roc_file, prc_file, enrich_file = [
        os.path.join(out_dir, "combined_{}.pkl.bz2".format(x))
        for x in ["roc", "prc", "enrichment"]
    ]

    logging.info("Computing ROC curves.")
    roc = roc_curve(y_true, y_score, drop_intermediate=True)
    auroc = auc(roc[0], roc[1])
    with smart_open(roc_file, "wb") as f:
        pkl.dump(roc, f, pkl.HIGHEST_PROTOCOL)
    logging.info("AUROC: {:.4f}".format(auroc))

    logging.info("Computing PRC curves.")
    prc_rec = precision_recall_curve(y_true, y_score)
    prc = (prc_rec[1], prc_rec[0], prc_rec[2])
    auprc = auc(prc[0], prc[1])
    imbalance = get_imbalance(y_true)
    with smart_open(prc_file, "wb") as f:
        pkl.dump(prc, f, pkl.HIGHEST_PROTOCOL)
    logging.info("AUPRC: {:.4f} ({:.4f} of data is positive)".format(
        auprc, imbalance))

    logging.info("Computing enrichment curves.")
    enrichment = enrichment_curve(y_true, y_score)
    with smart_open(enrich_file, "wb") as f:
        pkl.dump(enrichment, f, pkl.HIGHEST_PROTOCOL)
    auec = auc(enrichment[0], enrichment[1])
    logging.info("AUE: {:.4f}".format(auec))
Example #13
EXCLUDE_FLOATING_DEF = get_default_value("fingerprinting", "exclude_floating",
                                         bool)
IDENT_DTYPE = np.int64  # np.dtype to use for identifiers
Y_AXIS_PRECISION = 0.1  # angstroms
Z_AXIS_PRECISION = 0.01  # rad
POLAR_CONE_RAD = np.pi / 36  # rad
MMH3_SEED = 0
BOND_TYPES = {
    None: 5,
    Chem.BondType.SINGLE: 1,
    Chem.BondType.DOUBLE: 2,
    Chem.BondType.TRIPLE: 3,
    Chem.BondType.AROMATIC: 4
}

setup_logging(reset=False)


class Fingerprinter(object):
    """E3FP fingerprint generator.

    Parameters
    ----------
    bits : int or None, optional
        Maximum number of bits to which to fold returned fingerprint. Multiple
        of 2 is strongly recommended.
    level : int or None, optional
        Maximum number of iterations for fingerprint generation. If None or
        -1, run until no new substructures are identified. Because this could
        produce a different final level number for each conformer, it is
        recommended to manually specify a level.
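
The MMH3_SEED constant suggests substructure identifiers are hashed with MurmurHash3 and folded into the requested bit length; a minimal sketch of that folding (the identifier string is invented):

import mmh3

bits = 1024
ident = "2|6|1|4"  # hypothetical substructure identifier
bit_index = mmh3.hash(ident, 0) % bits  # signed 32-bit hash folded to [0, bits)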
Example #14
def run(sdf_files,
        bits=BITS,
        first=FIRST_DEF,
        level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF,
        counts=COUNTS_DEF,
        stereo=STEREO_DEF,
        include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF,
        params=None,
        out_dir_base=None,
        out_ext=OUT_EXT_DEF,
        db_file=None,
        overwrite=False,
        all_iters=False,
        log=None,
        num_proc=None,
        parallel_mode=None,
        verbose=False):
    """Generate E3FP fingerprints from SDF files."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(params, "fingerprinting",
                                      "radius_multiplier", float)
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(params, "fingerprinting",
                                         "include_disconnected", bool)
        rdkit_invariants = get_value(params, "fingerprinting",
                                     "rdkit_invariants", bool)
        exclude_floating = get_value(params, "fingerprinting",
                                     "exclude_floating", bool)

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    if para.rank == 0:
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        if db_file is None and out_dir_base is None:
            sys.exit('Either `db_file` or `out_dir_base` must be specified.')
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier))
        logging.info("Stereo Mode: {!s}".format(stereo))
        if not include_disconnected:
            logging.info("Connected-only mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        data_iterator = iter([])

    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        "save": False
    }
    if out_dir_base is not None:
        fp_kwargs['save'] = True

    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(fprints_dict_from_sdf, data_iterator,
                                **run_kwargs)

    if db_file is not None:
        fprints = []
        for result, data in results_iter:
            try:
                fprints.extend(result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # fprinting failed, assume logged in method
                continue
        if len(fprints) > 0:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
    else:
        list(results_iter)
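
fprints_dict_from_sdf appears to return a dict keyed by iteration level, so the .get fallback above takes the requested level or, failing that, the deepest level generated; on a toy dict:

result = {0: ["fp_level0"], 2: ["fp_level2"]}
level = 5
fprints = result.get(level, result[max(result.keys())])  # -> ["fp_level2"]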
Example #15
        f.write("{0},{1}\t{2:.{4}f}\t{3:.{4}f}\n".format(
            pair[0], pair[1], tcs[0], tcs[1], precision))
    f.close()
    del example_pairs

    return counts_file, examples_file


if __name__ == "__main__":
    usage = "python compare_fingerprints.py <ecfp4_molecules> <e3fp_molecules>"
    try:
        ecfp_mol_file, e3fp_mol_file = sys.argv[1:]
    except ValueError:
        sys.exit(usage)

    setup_logging("log.txt")
    para = Parallelizer(parallel_mode="mpi")
    if para.rank == 0:
        logging.info("Reading molecules")
    ecfp_fp_sets = molecules_to_fp_sets(ecfp_mol_file)
    e3fp_fp_sets = molecules_to_fp_sets(e3fp_mol_file)

    mutual_mols = sorted(set(ecfp_fp_sets.keys()) & set(e3fp_fp_sets.keys()))
    mol_num = int(MOL_FRAC * len(mutual_mols))

    if para.rank == 0:
        logging.info(
            "Found total of {} mols. Selecting {} for comparison.".format(
                len(mutual_mols), mol_num))
    mols = sorted(np.random.choice(mutual_mols, size=mol_num, replace=False))
Example #16
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide at least one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                i,  # index tiebreaker: RDKit Mol objects are unorderable
                mol,
            ) for i, mol in enumerate(mol_iter) if mol is not None]
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties)))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and not overwrite:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format(values_file))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(values_file))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()
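
The prioritization above sorts property tuples so that molecules with fewer rotatable bonds, then lower molecular weight, are processed first; the same idea on toy tuples:

mols_with_properties = [(5, 312.4, 0, "molC"), (2, 180.2, 1, "molA"),
                        (2, 151.1, 2, "molB")]
ordered = [x[-1] for x in sorted(mols_with_properties)]
# ordered == ["molB", "molA", "molC"]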
Example #17
def main(job_id, params, main_conf_dir=MAIN_CONF_DIR, main_dir=CV_DIR,
         out_dir=None, smiles_file=SMILES_FILE, check_existing=True,
         mol_targets_file=MOL_TARGETS_FILE, k=CV_K, log_file=LOG_FILE,
         verbose=False, overwrite=False, min_mols=MIN_MOLS_PER_TARGET,
         parallelizer=None):
    params = format_params(params)

    pre_encoding_params_string = params_to_str(params, with_first=False)
    params_string = params_to_str(params)
    if out_dir is None:
        out_dir = os.path.join(main_dir, params_string)
    touch_dir(out_dir)
    if log_file is not None:
        log_file = os.path.join(out_dir, log_file)
    setup_logging(log_file, verbose=verbose)

    params_file = os.path.join(out_dir, "params.cfg")
    config_parser = update_params(params, section_name="fingerprinting")
    write_params(config_parser, params_file)

    if not isinstance(parallelizer, Parallelizer):
        parallelizer = Parallelizer(parallel_mode="processes",
                                    num_proc=NUM_PROC)

    logging.info("Params: {!r}".format(params.items()))
    logging.info("Saving files to {:s}.".format(out_dir))

    logging.info("Checking for usable pre-existing fingerprints.")
    existing_molecules_file = get_existing_fprints(pre_encoding_params_string,
                                                   params['first'], main_dir)

    molecules_file = get_molecules_file(out_dir)
    if os.path.isfile(molecules_file) and not overwrite:
        logging.info("Molecules file already exists. Loading.")
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            molecules_file)
    elif existing_molecules_file is None:
        conf_dir = os.path.join(main_conf_dir, params['conformers'])
        logging.info("Generating fingerprints from conformers in "
                     "{!s}.".format(conf_dir))
        smiles_dict, mol_lists_dict, fp_type = params_to_molecules(
            params, smiles_file, conf_dir, out_dir, parallelizer=parallelizer)
    else:
        logging.info("Using native strings from existing molecules "
                     "file {!s}.".format(existing_molecules_file))
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            existing_molecules_file, first=params['first'])
        lists_dicts_to_molecules(get_molecules_file(out_dir),
                                 smiles_dict, mol_lists_dict, fp_type)

    targets_file = get_targets_file(out_dir)
    if overwrite or not os.path.isfile(targets_file):
        logging.info("Reading targets from {!s}.".format(mol_targets_file))
        targets_dict = targets_to_dict(mol_targets_file, affinity=AFFINITY)
        logging.debug("Read {:d} targets.".format(len(targets_dict)))
        logging.info("Filtering targets by molecules.")
        filtered_targets_dict = targets_to_mol_lists_targets(
            filter_targets_by_molecules(targets_dict, mol_lists_dict),
            mol_lists_dict)

        del targets_dict, smiles_dict, mol_lists_dict, fp_type
        logging.info("Saving filtered targets to {!s}.".format(targets_file))
        dict_to_targets(targets_file, filtered_targets_dict)
        del filtered_targets_dict
    else:
        logging.info("Targets file already exists. Skipping.")

    parallel_mode = parallelizer.parallel_mode
    parallelizer = Parallelizer(parallel_mode=parallel_mode, num_proc=k + 1)

    splitter = ByTargetMoleculeSplitter(k, reduce_negatives=REDUCE_NEGATIVES)
    kfold_cv = KFoldCrossValidator(k=k, parallelizer=parallelizer,
                                   splitter=splitter,
                                   return_auc_type=AUC_TYPE, out_dir=out_dir,
                                   overwrite=False)
    auc = kfold_cv.run(molecules_file, targets_file, min_mols=min_mols,
                       affinity=AFFINITY)
    logging.info("CV Mean AUC: {:.4f}".format(auc))
    return 1 - auc
Example #18
                               'classifiers_mean')
ECFP_MEAN_NB_DIR = os.path.join(CV_MEAN_BASEDIR, "ecfp4_nbc")
ECFP_MEAN_RF_DIR = os.path.join(CV_MEAN_BASEDIR, "ecfp4_rf")
ECFP_MEAN_SVM_DIR = os.path.join(CV_MEAN_BASEDIR, "ecfp4_linsvm")
ECFP_MEAN_NN_DIR = os.path.join(CV_MEAN_BASEDIR, "ecfp4_nn")
E3FP_MEAN_NB_DIR = os.path.join(CV_MEAN_BASEDIR, "e3fp_nbc")
E3FP_MEAN_RF_DIR = os.path.join(CV_MEAN_BASEDIR, "e3fp_rf")
E3FP_MEAN_SVM_DIR = os.path.join(CV_MEAN_BASEDIR, "e3fp_linsvm")
E3FP_MEAN_NN_DIR = os.path.join(CV_MEAN_BASEDIR, "e3fp_nn")
SKLEARN_PRC_BASENAME = "fig_s5"
SKLEARN_ROC_BASENAME = "fig_s6"

SEA_ROC_YMIN = .969
SKLEARN_ROC_YMIN = .874
FRAC_POS = .0051
setup_logging(verbose=True)


def get_best_fold_by_prc(cv_dir):
    """Get number of best fold by fold AUPRC."""
    log_file = os.path.join(cv_dir, "log.txt")
    logging.debug("Opening log file: {}".format(log_file))
    fold_results = []
    with open(log_file, "r") as f:
        for line in f:
            if "|Fold " in line:
                fold = int(line.split('Fold ')[1].split()[0])
                auprc = float(line.split('AUPRC of ')[1].split()[0][:-1])
                fold_results.append((auprc, fold))
    return sorted(fold_results)[-1][1]
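
The parsing relies on a fixed log-line format; applied to one hypothetical line:

line = "2017-01-01 00:00:00|Fold 3 finished with an AUPRC of 0.7412, ..."
fold = int(line.split('Fold ')[1].split()[0])                # -> 3
auprc = float(line.split('AUPRC of ')[1].split()[0][:-1])    # -> 0.7412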
Example #19
ECFP_TC_CUTOFF = 0.3
OUTFILE = "unique_hits_aff{:d}_3dpval{:.4g}_2dpval{:.4g}.txt".format(
    E3FP_MIN_AFFINITY, E3FP_MAX_PVALUE, ECFP_MIN_PVALUE)


def reformat_mol_results(mol_results_dict):
    new_mol_results = {}
    for mol_name, hit_dict in mol_results_dict.items():
        for target_key, result_tuple in hit_dict.items():
            new_mol_results.setdefault(mol_name, {}).setdefault(
                target_key.tid, {})[int(target_key.group)] = result_tuple
    return new_mol_results
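
The nested setdefault chain regroups protomer-level hits by molecule name, then target id, then (assuming target_key.group numbers protomer groups) group index; the same pattern on toy values:

new = {}
new.setdefault("mol1", {}).setdefault("T100", {})[0] = (0.001, 0.99)
new.setdefault("mol1", {}).setdefault("T100", {})[1] = (0.005, 0.95)
# new == {"mol1": {"T100": {0: (0.001, 0.99), 1: (0.005, 0.95)}}}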


if __name__ == "__main__":
    setup_logging()

    logging.info("Loading and reformatting E3FP results.")
    e3fp_mol_results = reformat_mol_results(
        pickle.load(smart_open("e3fp/mol_results.pkl.bz2", "rb")))
    logging.info("Loading and reformatting ECFP4 results.")
    ecfp4_mol_results = reformat_mol_results(
        pickle.load(smart_open("ecfp4/mol_results.pkl.bz2", "rb")))

    logging.info("Getting valid mol/target pairs.")
    e3fp_unique_mol_results = {}
    for mol_name, hit_dict in e3fp_mol_results.items():
        for tid, affinity_results in hit_dict.items():
            if (mol_name not in ecfp4_mol_results
                    or tid not in ecfp4_mol_results[mol_name]
                    or ecfp4_mol_results[mol_name][tid][max(