示例#1
0
    def test_save_txt(self):
        """Check that `savetxt` writes the expected bitstring rows.

        The same database is written twice to a temporary gzipped text file,
        once with the fingerprint names appended to each row and once without,
        and the raw file contents are compared against the expected bytes.
        """
        from e3fp.fingerprint.db import FingerprintDatabase
        from python_utilities.io_tools import smart_open

        bits = np.array(
            [[1, 0, 0, 1, 1], [0, 0, 0, 1, 0], [0, 1, 1, 1, 1]], dtype=np.bool_
        )
        db = FingerprintDatabase.from_array(bits, ["1", "2", "3"])

        # Each case: (extra keyword arguments for savetxt, expected contents).
        cases = (
            ({}, b"10011 1\n00010 2\n01111 3\n"),
            ({"with_names": False}, b"10011\n00010\n01111\n"),
        )
        for save_kwargs, expected in cases:
            handle, path = tempfile.mkstemp(suffix=".txt.gz")
            os.close(handle)
            db.savetxt(path, **save_kwargs)
            with smart_open(path, "r") as infile:
                written = infile.read()
            self.assertEqual(written, expected)
            os.unlink(path)
示例#2
0
def main(sdf_dir, mol_file, num_pairs=10000,
         out_sdf_file="random_pairs.sdf.bz2"):
    """Write molecule pairs for randomly chosen protonation states to SDF.

    Only entries of `mol_file` with more than one item are eligible. They are
    sampled without replacement, each weighted by the inverse of the number
    of eligible entries that share its molecule name (the text before the
    first "-"), so molecules with many entries are not over-represented.

    Parameters
    ----------
    sdf_dir : str
        Directory handed to `get_random_pairs` for every chosen entry.
    mol_file : str
        Molecules file read with `molecules_to_lists_dicts`.
    num_pairs : int, optional
        Number of entries to sample. `numpy.random.choice` raises
        `ValueError` if this exceeds the number of eligible entries.
    out_sdf_file : str, optional
        Output SDF file that receives both molecules of every pair.
    """
    logging.info("Loading molecules file.")
    smiles_dict, mol_list_dict, fp_type = molecules_to_lists_dicts(
        mol_file, merge_proto=False)
    mol_list_dict = {k: v for k, v in mol_list_dict.items() if len(v) > 1}

    logging.info("Picking random molecules.")
    # Count how many eligible entries belong to each molecule name.
    mol_proto_num = {}
    for proto_name in mol_list_dict:
        mol_name = proto_name.split("-")[0]
        mol_proto_num[mol_name] = mol_proto_num.get(mol_name, 0) + 1
    proto_names = list(mol_list_dict)
    proto_nums = [mol_proto_num[k.split("-")[0]] for k in proto_names]
    proto_probs = 1. / np.asanyarray(proto_nums)
    proto_probs /= np.sum(proto_probs)

    random_proto_names = np.random.choice(proto_names, size=num_pairs,
                                          replace=False, p=proto_probs)

    with smart_open(out_sdf_file, "wb") as f:
        writer = rdkit.Chem.SDWriter(f)
        try:
            for i, proto_name in enumerate(sorted(random_proto_names)):
                mol1, mol2 = get_random_pairs(proto_name, sdf_dir)
                writer.write(mol1)
                writer.write(mol2)

                if i > 0 and i % 100 == 0:
                    logging.info(i)
        finally:
            # Close the writer while the underlying file is still open so
            # that everything it buffered actually reaches the file.
            writer.close()
示例#3
0
def compute_fold_metrics(target_mol_array,
                         mask_file,
                         results_file,
                         thresh=None):
    """Score one fold on its test entries.

    The pickled mask in `mask_file` selects the test entries (mask value 1)
    of both `target_mol_array` and the "results" array stored in
    `results_file`. Entries whose score is NaN are dropped. When `thresh` is
    None, the threshold returned by `get_max_f1_thresh` is used.

    Returns
    -------
    tuple
        (pvalue, sensitivity, specificity, precision, f1), where
        ``pvalue = 10**(-thresh)``.
    """
    logging.info("Loading mask file.")
    with smart_open(mask_file, "rb") as mask_fh:
        fold_mask = pkl.load(mask_fh)
    is_test = fold_mask == 1
    del fold_mask

    logging.info("Loading results from file.")
    with np.load(results_file) as archive:
        results = archive["results"]

    logging.info("Computing metrics.")
    y_true = target_mol_array[is_test].ravel()
    y_score = results[is_test].ravel()
    # Keep only the positions that actually have a score.
    scored = np.where(~np.isnan(y_score))
    y_true = y_true[scored]
    y_score = y_score[scored]
    # Drop the large inputs before the metric computations.
    del results, is_test, target_mol_array

    if thresh is None:
        _, thresh = get_max_f1_thresh(y_true, y_score)
    pvalue = 10**(-thresh)

    metrics = get_metrics_at_thresh(y_true, y_score, thresh)
    sensitivity, specificity, precision, f1 = metrics
    message = ("P-value: {:.4g}  Sensitivity: {:.4f}  "
               "Specificity: {:.4f}  Precision: {:.4f}  "
               "F1: {:.4f}")
    logging.debug(message.format(pvalue, sensitivity, specificity,
                                 precision, f1))

    return (pvalue, sensitivity, specificity, precision, f1)
def get_num_mols(mol_names_file):
    """Return the number of non-blank lines in `mol_names_file`."""
    with smart_open(mol_names_file, 'rb') as handle:
        return sum(1 for row in handle if row.rstrip())
示例#5
0
    def save_fold_files(self, train_test_mask, mol_list, target_list,
                        smiles_dict, mol_list_dict, fp_type, target_dict):
        """Persist the fold mask and, for SEA searches, the split input files.

        The mask is always pickled to `self.mask_file`. The train/test
        molecules and targets files are only written when `self.cv_method` is
        a `SEASearchCVMethod`, and then only if `self.overwrite` is set or at
        least one of the four files is missing from `self.out_dir`.
        """
        with smart_open(self.mask_file, "wb") as mask_fh:
            pkl.dump(train_test_mask, mask_fh, pkl.HIGHEST_PROTOCOL)

        if not isinstance(self.cv_method, SEASearchCVMethod):
            return

        def fold_path(basename):
            return os.path.join(self.out_dir, basename)

        test_molecules_file = fold_path("test_molecules.csv.bz2")
        test_targets_file = fold_path("test_targets.csv.bz2")
        train_molecules_file = fold_path("train_molecules.csv.bz2")
        train_targets_file = fold_path("train_targets.csv.bz2")

        fold_files = (test_molecules_file, test_targets_file,
                      train_molecules_file, train_targets_file)
        # Nothing to do when every file is already present and overwriting
        # was not requested.
        if not self.overwrite and all(
                os.path.isfile(path) for path in fold_files):
            return

        split = train_test_dicts_from_mask(
            mol_list_dict, mol_list, target_dict, target_list,
            train_test_mask)
        (train_mol_list_dict, train_target_dict, test_mol_list_dict,
         test_target_dict) = split

        lists_dicts_to_molecules(test_molecules_file, smiles_dict,
                                 test_mol_list_dict, fp_type)
        lists_dicts_to_molecules(train_molecules_file, smiles_dict,
                                 train_mol_list_dict, fp_type)

        train_target_dict = targets_to_mol_lists_targets(
            train_target_dict, train_mol_list_dict)
        test_target_dict = targets_to_mol_lists_targets(
            test_target_dict, test_mol_list_dict)
        dict_to_targets(train_targets_file, train_target_dict)
        dict_to_targets(test_targets_file, test_target_dict)
示例#6
0
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write RDKit `Mol` objects to an SDF file.

    Each conformer is written as its own record. When an energy is available
    for a conformer it is stored on that record under
    `CONF_ENERGY_PROPNAME`. After writing, the per-record property is cleared
    from `mol` and the full list of energies is re-attached with
    `add_conformer_energies_to_mol`.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all
        (`None` and -1 both mean no limit).
    """
    touch_dir(os.path.dirname(out_file))
    # Counted explicitly: the loop variable is unbound when there are no
    # conformers and is one too high after the early `break`.
    saved_num = 0
    with smart_open(out_file, "w") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        conf_energies = get_conformer_energies_from_mol(mol)
        mol.ClearProp(CONF_ENERGIES_PROPNAME)
        for i in conf_ids:
            if conf_num not in {-1, None} and i >= conf_num:
                break
            try:
                conf_energy = conf_energies[i]
                mol.SetProp(CONF_ENERGY_PROPNAME, "{:.4f}".format(conf_energy))
            except (IndexError, TypeError):
                # No energy for this conformer (list too short or no energies
                # at all): make sure the previous conformer's energy is not
                # written out with this record.
                mol.ClearProp(CONF_ENERGY_PROPNAME)
            writer.write(mol, confId=i)
            saved_num += 1
        writer.close()
        mol.ClearProp(CONF_ENERGY_PROPNAME)
        if conf_energies is not None:
            add_conformer_energies_to_mol(mol, conf_energies)
    logging.debug("Saved {:d} conformers to {}.".format(saved_num, out_file))
示例#7
0
def smiles_generator(*filenames):
    """Parse SMILES file(s) and yield (smile, name).

    Parameters
    ----------
    *filenames : str
        Files containing SMILES. Each line must contain one smile followed by
        whitespace and then the molecule name; anything after the second
        whitespace-separated field is ignored.

    Yields
    ------
    tuple
        `tuple` of the format (smile, name).

    Notes
    -----
    Lines with fewer than two fields are skipped with a warning.
    """
    for filename in filenames:
        with smart_open(filename, "r") as f:
            for line_num, line in enumerate(f, 1):
                values = line.rstrip("\r\n").split()
                if len(values) >= 2:
                    yield tuple(values[:2])
                else:
                    # No exception is active here, so no `exc_info`: it would
                    # only append a meaningless "NoneType: None" to the log.
                    logging.warning(
                        "Line {:d} of {} has {:d} entries. Expected at least"
                        " 2.".format(line_num, filename, len(values)))
示例#8
0
def results_from_fold_dir(fold_dir, basename="combined"):
    """Unpickle every file in `fold_dir` named ``<basename>.*``.

    Returns the loaded objects as a list, in the order `glob.glob` reports
    the files.
    """
    pattern = os.path.join(fold_dir, "{}.*".format(basename))
    loaded = []
    for path in glob.glob(pattern):
        logging.debug("Opening {}...".format(path))
        with smart_open(path, "rb") as handle:
            loaded.append(pkl.load(handle))
    return loaded
示例#9
0
 def load_fit_file(self, fit_file):
     """Rebuild a classifier from a pickled fit file.

     The pickle is read once to obtain the first entry stored under its
     'hidden' key; its transpose is passed to `create_clf` as `data`. The
     new classifier then loads its parameters from the same file.
     """
     with smart_open(fit_file, "rb") as fit_fh:
         saved_fit = pkl.load(fit_fh)
     classifier = self.create_clf(data=saved_fit['hidden'][0].T)
     classifier.load_params_from(fit_file)
     return classifier
示例#10
0
def main(sdf_dir, mol_file, num_confs=10000,
         out_conf_file="random_conformers.txt",
         out_sdf_file="random_conformers.sdf.bz2",
         out_mol_file="random_conformers.csv.bz2"):
    """Pick random conformers and save their names and structures.

    If `out_mol_file` already exists, the conformers listed in it are reused.
    Otherwise conformers are drawn from `mol_file` by choosing a random
    molecule, then one of its entries, then one of that entry's conformers,
    until `num_confs` distinct conformers have been collected. The selection
    is then written to `out_mol_file`.

    Parameters
    ----------
    sdf_dir : str
        Directory searched for ``<name>.sdf*`` files of the chosen entries.
    mol_file : str
        Molecules file to sample from.
    num_confs : int, optional
        Number of distinct conformers to pick.
    out_conf_file : str, optional
        Text file receiving one conformer name per line.
    out_sdf_file : str, optional
        SDF file receiving the chosen conformers.
    out_mol_file : str, optional
        Molecules file holding the selection (read if it already exists).
    """
    confs = set()
    if os.path.isfile(out_mol_file):
        logging.info("Loading existing random molecules.")
        _, conf_mol_list_dict, _ = molecules_to_lists_dicts(out_mol_file,
                                                            merge_proto=False)
        for proto_name in conf_mol_list_dict:
            for _, conf_name in conf_mol_list_dict[proto_name]:
                confs.add(split_conf_name(conf_name))
    else:
        logging.info("Loading molecules file.")
        smiles_dict, mol_list_dict, fp_type = molecules_to_lists_dicts(
            mol_file, merge_proto=False)
        mol_name_to_proto_names = {}
        for proto_name in mol_list_dict:
            mol_name, _ = split_conf_name(proto_name)
            mol_name_to_proto_names.setdefault(mol_name, []).append(proto_name)
        # `random.choice` needs an indexable sequence, which a Python 3 dict
        # view is not; building the list once also avoids recreating it on
        # every iteration.
        mol_names = list(mol_name_to_proto_names)
        conf_mol_list_dict = {}
        logging.info("Picking random molecules.")
        # NOTE(review): this loop cannot finish if fewer than `num_confs`
        # distinct conformers are available.
        while len(confs) < num_confs:
            mol_name = random.choice(mol_names)
            proto_name = random.choice(mol_name_to_proto_names[mol_name])
            _, conf_name = random.choice(mol_list_dict[proto_name])
            conf = split_conf_name(conf_name)
            confs.add(conf)
            conf_mol_list_dict.setdefault(proto_name, set()).add(
                mol_list_dict[proto_name][conf[2]])
            if len(confs) % 100 == 0:
                logging.info(len(confs))
        conf_mol_list_dict = {k: sorted(v) for k, v
                              in conf_mol_list_dict.items()}
        lists_dicts_to_molecules(out_mol_file, smiles_dict, conf_mol_list_dict,
                                 fp_type)
    confs = sorted(confs)

    logging.info("Writing mol names to file.")
    with open(out_conf_file, "w") as f:
        for conf in confs:
            f.write("{}\n".format(join_conf_name(*conf)))

    logging.info("Saving mols to SDF file.")
    with smart_open(out_sdf_file, "wb") as f:
        writer = rdkit.Chem.SDWriter(f)
        for j, conf in enumerate(confs):
            mol_name, proto_id, conf_id = conf
            sdf_file = glob.glob(os.path.join(
                sdf_dir, "{}.sdf*".format(
                    join_conf_name(mol_name, proto_id))))[0]
            # Read just far enough into the file to reach the wanted conformer.
            mol = mol_from_sdf(sdf_file, conf_num=conf_id + 1)
            name = join_conf_name(*conf)
            mol.SetProp("_Name", name)
            writer.write(mol, confId=conf_id)
            if j > 0 and j % 10 == 0:
                logging.info(j)
        writer.close()
示例#11
0
 def save_fit_file(self, target_key, clf):
     """Save target fit to file.

     Parameters
     ----------
     target_key : object
         Key resolved to a file name with `_fit_file_from_target_key`. If
         that lookup fails, `target_key` itself is used as the file name.
     clf : object
         Fitted classifier to pickle.

     Returns
     -------
     Name of the file the classifier was written to.
     """
     try:
         fit_file = self._fit_file_from_target_key(target_key)
     except Exception:  # assume target_key is a fit file.
         # `Exception` rather than a bare `except`, so KeyboardInterrupt and
         # SystemExit are not swallowed.
         fit_file = target_key
     # Pickle data is binary; text mode breaks it for uncompressed files.
     with smart_open(fit_file, "wb") as f:
         pkl.dump(clf, f)
     return fit_file
示例#12
0
def main(molecules_file,
         library_file,
         target_results_pickle=TARGET_RESULTS_PICKLE_DEF,
         mol_results_pickle=MOL_RESULTS_PICKLE_DEF,
         log_file=None,
         verbose=False):
    """Run SEA set searches for a molecules file and pickle the results.

    The searcher's `target_results_dict` is written to
    `target_results_pickle` and its `set_results_dict` to
    `mol_results_pickle`.
    """
    setup_logging(log_file, verbose=verbose)

    logging.info("Loading molecules file")
    loaded = molecules_to_lists_dicts(molecules_file)
    smiles_dict, mol_lists_dict, fp_type = loaded
    # Only the molecule lists are used from here on.
    del loaded, smiles_dict, fp_type

    mol_count = len(mol_lists_dict)
    logging.info("Running SEA searches with {:d} molecules.".format(mol_count))
    set_searcher = sea_set_search(library_file, mol_lists_dict)

    logging.info("Saving results to pickles.")
    with smart_open(target_results_pickle, "wb") as target_fh:
        pickle.dump(set_searcher.target_results_dict, target_fh)
    with smart_open(mol_results_pickle, "wb") as mol_fh:
        pickle.dump(set_searcher.set_results_dict, mol_fh)
示例#13
0
def main(mfile1, mfile2, name1, name2, out_file, precision=PRECISION,
         log_freq=LOG_FREQ, num_proc=None, parallel_mode=None):
    """Count binned value pairs from two memmaps and write them to CSV.

    The index range of the memmaps is split into chunks, `count_tcs` is run
    on each chunk through a `Parallelizer`, and the per-chunk counts are
    merged. One row is written per distinct pair: both values divided by
    ``10**precision`` and rounded to `precision` digits, then the count.

    Parameters
    ----------
    mfile1, mfile2 : str
        Memmap files; they must load to the same shape.
    name1, name2 : str
        Column headers for the two values. They are also used to build the
        default output file name.
    out_file : str
        Output CSV. If empty, defaults to
        ``<name1>_<name2>_tcs.csv.gz`` with names lower-cased and whitespace
        replaced by underscores.
    precision : int, optional
        Passed to `count_tcs`; also the number of digits written.
    log_freq : int, optional
        Passed to `count_tcs`.
    num_proc : int or None, optional
        Passed to `Parallelizer`.
    parallel_mode : str or None, optional
        Passed to `Parallelizer`.

    Raises
    ------
    ValueError
        If the two memmaps do not have the same shape.
    """
    import re  # local: only needed to build the default output file name

    setup_logging()
    if not out_file:
        # `str.replace` has no regex support, so '\s' never matched any
        # whitespace; substitute with a real pattern instead.
        out_file = (re.sub(r"\s", "_", name1.lower()) + "_" +
                    re.sub(r"\s", "_", name2.lower()) + "_tcs.csv.gz")

    # Load files
    mmap1 = load_mmap(mfile1)
    mmap2 = load_mmap(mfile2)
    if mmap1.shape != mmap2.shape:
        raise ValueError(
            "Memmaps do not have the same shape: {} {}".format(
                mmap1.shape, mmap2.shape))

    # Count binned pairs
    pair_num = mmap1.shape[0]
    del mmap1, mmap2

    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    num_proc = max(para.num_proc - 1, 1)
    # Evenly spaced boundaries turned into inclusive (start, end) ranges that
    # together cover 0 .. pair_num - 1.
    chunk_bounds = np.linspace(-1, pair_num - 1, num_proc + 1, dtype=int)
    chunk_bounds = list(zip(chunk_bounds[:-1] + 1, chunk_bounds[1:]))
    logging.info("Divided into {} chunks with ranges: {}".format(num_proc,
                                                                 chunk_bounds))

    logging.info("Counting TCs in chunks.")
    kwargs = {"mfile1": mfile1, "mfile2": mfile2, "precision": precision,
              "log_freq": log_freq}
    results_iter = para.run_gen(count_tcs, chunk_bounds, kwargs=kwargs)
    tc_pair_counts = Counter()
    for chunk_counts, _ in results_iter:
        if not isinstance(chunk_counts, dict):
            logging.error("Results are not in dict form.")
            continue
        tc_pair_counts.update(chunk_counts)

    # Write pairs to file
    logging.info("Writing binned pairs to {}.".format(out_file))
    mult = 10**precision
    with smart_open(out_file, "wb") as f:
        writer = csv.writer(f, delimiter=SEP)
        writer.writerow([name1, name2, "Count"])
        for pair in sorted(tc_pair_counts):
            writer.writerow([round(pair[0] / mult, precision),
                             round(pair[1] / mult, precision),
                             tc_pair_counts[pair]])

    total_counts = sum(tc_pair_counts.values())
    if total_counts != pair_num:
        logging.warning(
            "Pair counts {} did not match expected number {}".format(
                total_counts, pair_num))
        return
    logging.info("Completed.")
示例#14
0
def mol_from_sdf(sdf_file, conf_num=None, standardise=False):
    """Read SDF file into an RDKit `Mol` object.

    The first record read becomes the returned molecule; the first conformer
    of every record read (including the first) is added to it as a conformer.
    Energies found under `CONF_ENERGY_PROPNAME` on the records are collected
    and attached with `add_conformer_energies_to_mol`.

    Parameters
    ----------
    sdf_file : str
        Path to an SDF file
    conf_num : int or None, optional
        Maximum number of conformers to read from file. Defaults to all.
    standardise : bool (default False)
        Clean mol through standardisation

    Returns
    -------
    RDKit Mol : `Mol` object with each molecule in SDF file as a conformer
    """
    mol = None
    conf_energies = []
    with smart_open(sdf_file, "r") as f:
        supplier = rdkit.Chem.ForwardSDMolSupplier(f)
        i = 0  # number of conformers read so far
        while True:
            # Stop once the requested number has been read. With the default
            # `None` this never matches, so the whole file is read.
            if i == conf_num:
                break
            try:
                new_mol = next(supplier)
            except StopIteration:
                logging.debug("Read {:d} conformers from {}.".format(
                    i, sdf_file))
                break

            # NOTE(review): presumably the supplier can yield None for a
            # record it cannot parse, which would fail here; confirm.
            if new_mol.HasProp(CONF_ENERGY_PROPNAME):
                conf_energies.append(
                    float(new_mol.GetProp(CONF_ENERGY_PROPNAME)))

            if mol is None:
                # Copy the first record and drop its conformer; it is added
                # back below like every other record's conformer.
                mol = rdkit.Chem.Mol(new_mol)
                mol.RemoveAllConformers()
            conf = new_mol.GetConformers()[0]
            mol.AddConformer(conf, assignId=True)
            i += 1
    # NOTE(review): `mol` is still None here if no record was read (empty
    # file or conf_num == 0), in which case the calls below fail.
    if standardise:
        mol = mol_to_standardised_mol(mol)
    try:
        mol.GetProp("_Name")
    except KeyError:
        # No name stored: fall back to the file name up to ".sdf".
        name = os.path.basename(sdf_file).split(".sdf")[0]
        mol.SetProp("_Name", name)

    if len(conf_energies) > 0:
        add_conformer_energies_to_mol(mol, conf_energies)
        # Remove the per-record energy inherited from the first record.
        mol.ClearProp(CONF_ENERGY_PROPNAME)

    return mol
示例#15
0
    def savetxt(self, fn, with_names=True):
        """Write one bitstring per fingerprint to a text file.

        Only available when `fp_type` is `Fingerprint`. Every row holds
        `bits` characters of "0"/"1", optionally followed by a space and the
        fingerprint name. Not intended for large bit lengths.

        Parameters
        ----------
        fn : str or filehandle
            Out file. Extension is automatically parsed to determine whether
            compression is used.
        with_names : bool, optional
            Append the fingerprint name to each row after the bitstring.

        Raises
        ------
        E3FPInvalidFingerprintError
            If `fp_type` is not `Fingerprint`.
        E3FPEfficiencyWarning
            Issued as a warning if `bits` is over 2^14 = 16384.
        """
        if self.fp_type is not Fingerprint:
            raise E3FPInvalidFingerprintError(
                "Only binary `Fingerprint` databases may be saved to "
                "bitstrings."
            )

        if self.bits > 2 ** 14:
            message = (
                "Saving sparse bitstrings to text file is highly "
                "inefficient for large bit lengths"
            )
            warnings.warn(
                message, category=E3FPEfficiencyWarning, stacklevel=2
            )

        row_fmt = "{0:s} {1:s}" if with_names else "{0:s}"

        with smart_open(fn, "w") as out:
            # Work on the underlying sparse arrays directly; this is much
            # cheaper than slicing rows out of the matrix.
            indptr = self.array.indptr
            all_indices = self.array.indices
            for row in range(self.fp_num):
                on_bits = all_indices[indptr[row] : indptr[row + 1]]
                # Number of zeros before, between and after the set bits.
                gaps = np.diff(np.r_[-1, on_bits, self.bits]) - 1
                bitstring = "1".join(["0" * gap for gap in gaps])
                out.write(
                    row_fmt.format(bitstring, self.fp_names[row]) + "\n"
                )
示例#16
0
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Conformer files in `conf_dir` whose base name (up to the first ".") is a
    key of the SMILES dict are fingerprinted with `native_tuples_from_sdf`,
    and the results are written to the molecules file for `out_dir`.

    Parameters
    ----------
    params : dict
        Must provide "radius_multiplier", "bits", "first" and "level". The
        whole dict is also passed to `fprint_params_to_fptype`.
    smiles_file : str
        SMILES file read with `smiles_to_dict`.
    conf_dir : str
        Directory containing conformer files.
    out_dir : str
        Directory passed to `get_molecules_file`.
    parallelizer : object or None, optional
        If given, its `run_gen` is used to fingerprint the files; otherwise
        they are processed one after another.

    Returns
    -------
    tuple
        Result of `molecules_to_lists_dicts` on the written file:
        (smiles dict, molecule lists dict, fingerprint type).

    Raises
    ------
    Exception
        If no file in `conf_dir` matches a SMILES entry.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    # `dict.items()` cannot be indexed on Python 3; take the first item
    # through an iterator (None when the dict is empty).
    logging.debug("Example SMILES: {!r}".format(
        next(iter(smiles_dict.items()), None)))
    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))

    if len(sdf_files) == 0:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}

    data_iterator = make_data_iterator(sdf_files)
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except ValueError:
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Skip this result; without this the code below would fail
                # or silently reuse the previous iteration's fingerprints.
                continue
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    del smiles_dict
    filtered_smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
示例#17
0
    def save(self, fn="fingerprints.fps.bz2"):
        """Save database to file.

        The whole database object is pickled to the file.

        Parameters
        ----------
        fn : str, optional
            Filename or basename if extension does not include '.fps'. In the
            latter case ".fps.bz2" is appended.
        """
        if ".fps" not in fn:
            fn += ".fps.bz2"
        # Pickle data is binary; text mode breaks it for uncompressed files.
        with smart_open(fn, "wb") as f:
            pkl.dump(self, f)
示例#18
0
def lists_dicts_to_molecules(molecules_file, smiles_dict, mol_lists_dict,
                             fp_type):
    """Write a molecules file from per-molecule lists of native tuples.

    After the fingerprint type header and the column header, one row
    (fingerprint name, smiles, native fingerprint) is written per tuple.
    Molecules are visited in sorted order of the `smiles_dict` keys; those
    without an entry in `mol_lists_dict` produce no rows.
    """
    with smart_open(molecules_file, "wb") as out:
        csv_writer = csv.writer(out)
        fp_type.write(csv_writer)
        csv_writer.writerow(("molecule id", "smiles", "fingerprint"))
        for name in sorted(smiles_dict):
            mol_smiles = smiles_dict[name]
            for fp_native, fp_name in mol_lists_dict.get(name, []):
                csv_writer.writerow((fp_name, mol_smiles, fp_native))
示例#19
0
文件: fprint.py 项目: zeromtmu/e3fp
def _save(f, *fps, **kwargs):
    """Pickle fingerprints one after another into a single file.

    Parameters
    ----------
    f : str or filehandle
        Destination passed to `smart_open`.
    *fps
        Objects to pickle, in order.
    **kwargs
        Only "protocol" is used: the pickle protocol, defaulting to
        `pkl.HIGHEST_PROTOCOL` when missing or None.

    Returns
    -------
    bool
        Always True.
    """
    protocol = kwargs.get("protocol")
    if protocol is None:
        protocol = pkl.HIGHEST_PROTOCOL

    # Pickle data is binary; text mode breaks it for uncompressed files.
    with smart_open(f, "wb") as fh:
        for fp in fps:
            pkl.dump(fp, fh, protocol)

    return True
示例#20
0
    def load(cls, fn):
        """Load database from file.

        The extension is used to determine how database was serialized
        (`save` vs `savez`): files ending in ".fpz" are read with `np.load`,
        anything else is unpickled.

        Parameters
        ----------
        fn : str
            Filename

        Returns
        -------
        FingerprintDatabase
            Database
        """
        # NOTE: both branches can unpickle arbitrary objects
        # (`allow_pickle=True` / `pkl.load`); only load trusted files.
        if fn.endswith(".fpz"):
            if scipy.__version__ < "1.0":
                warnings.warn(
                    (
                        "Use SciPy 1.0 or newer to efficiently load large "
                        "FingerprintDatabases."
                    ),
                    category=E3FPEfficiencyWarning,
                    stacklevel=2,
                )
            # Read every array while the archive is open, then close it
            # instead of leaving the file handle dangling.
            with np.load(fn, allow_pickle=True) as data:
                array_dict = dict(data.items())
            # Keys starting with "_" hold the database properties.
            props_dict = {}
            for k in list(array_dict.keys()):
                if k.startswith("_"):
                    v = array_dict.pop(k)
                    props_dict[k[1:]] = v
            array = csr_matrix(
                (
                    array_dict["data"],
                    array_dict["indices"],
                    array_dict["indptr"],
                ),
                shape=array_dict["shape"],
            )
            return FingerprintDatabase.from_array(
                array,
                array_dict["fp_names"],
                fp_type=array_dict["fp_type"].item(),
                level=array_dict["level"].item(),
                name=array_dict["name"].item(),
                props=props_dict,
            )
        else:
            with smart_open(fn) as f:
                return pkl.load(f)
示例#21
0
 def process_fingerprints(self, fprint_dict):
     """Combine each molecule's fingerprints according to `self.mode`.

     Parameters
     ----------
     fprint_dict : dict
         Maps molecule names to lists of fingerprints.

     Returns
     -------
     dict
         Maps molecule names to lists of processed fingerprints:

         - "union": a single fingerprint built from `fp.add` of all inputs.
         - "mean": a single `fp.mean` of all inputs.
         - "mean-boltzmann": a single mean weighted by Boltzmann factors of
           the energies read from `self.energies_file` (tab-separated
           name/energy lines); falls back to the unweighted mean when the
           weights sum to 0.
         - "first": a copy of the first fingerprint of every run of
           consecutive fingerprints sharing the name prefix before "_".

         Any other mode gives an empty dict.
     """
     new_fprint_dict = {}
     # `.items()` works on both Python 2 and 3, unlike `.iteritems()`.
     if self.mode == "union":
         for mol_name, fprints in fprint_dict.items():
             new_fprint_dict[mol_name] = [
                 fp.Fingerprint.from_fingerprint(fp.add(fprints))
             ]
             new_fprint_dict[mol_name][0].name = mol_name
     elif self.mode == "mean":
         for mol_name, fprints in fprint_dict.items():
             new_fprint_dict[mol_name] = [fp.mean(fprints)]
             new_fprint_dict[mol_name][0].name = mol_name
     elif self.mode == "mean-boltzmann":
         energies_dict = {}
         with smart_open(self.energies_file, "r") as f:
             for line in f:
                 name, energy = line.rstrip().split('\t')
                 energies_dict[name] = float(energy)
         for mol_name, fprints in fprint_dict.items():
             energies = np.array(
                 [energies_dict[fprint.name] for fprint in fprints])
             # Shift by the lowest energy so the largest weight is exp(0);
             # this reduces the risk of overflow.
             e_min = energies.min()
             adjusted_energies = energies - e_min
             probs = np.exp(-adjusted_energies / KT)
             prob_sum = np.sum(probs)
             if prob_sum == 0.:
                 logging.warning(
                     ("Boltzmann probabilities for {} sum to 0. Using "
                      "unweighted mean.").format(mol_name))
                 new_fprint_dict[mol_name] = [fp.mean(fprints)]
             else:
                 if prob_sum == 1. and probs.shape[0] > 1:
                     logging.info(
                         ("Boltzmann probabilities for {} dominated by 1 "
                          "term.").format(mol_name))
                 new_fprint_dict[mol_name] = [
                     fp.mean(fprints, weights=probs)
                 ]
             new_fprint_dict[mol_name][0].name = mol_name
     elif self.mode == "first":
         for mol_name, fprints in fprint_dict.items():
             new_fprint_dict[mol_name] = []
             # groupby only merges neighbours, so fingerprints sharing a
             # prefix must already be adjacent to form a single group.
             for proto_name, proto_fprints in itertools.groupby(
                     fprints, key=lambda x: x.name.split('_')[0]):
                 first_fprint = copy.deepcopy(next(proto_fprints))
                 first_fprint.name = proto_name
                 new_fprint_dict[mol_name].append(first_fprint)
     return new_fprint_dict
示例#22
0
文件: db.py 项目: amrhamedp/e3fp
    def load(cls, fn):
        """Unpickle a database from a file.

        Parameters
        ----------
        fn : str
            Filename

        Returns
        -------
        FingerprintDatabase
            Database
        """
        with smart_open(fn) as handle:
            database = pkl.load(handle)
        return database
示例#23
0
def prc_roc_aucs_from_cv_dirs(cv_dirs):
    """Collect per-fold AUROC/AUPRC values from cross-validation logs.

    Every line of each directory's ``log.txt`` that matches
    ``Fold ... AUROC of 0.<digits> ... AUPRC of 0.<digits>`` contributes one
    pair of values.

    Parameters
    ----------
    cv_dirs : iterable of str
        Cross-validation directories, each containing a ``log.txt``.

    Returns
    -------
    tuple
        (aurocs, auprcs), two equally long tuples of floats.

    Raises
    ------
    IndexError
        If a directory has no ``log.txt``.
    ValueError
        If no line in any log matched.
    """
    # Raw string: "\." and "\d" are invalid escapes in a normal literal.
    fold_auc_pattern = re.compile(
        r'Fold.*AUROC of (0\.\d+).*AUPRC of (0\.\d+)')
    aucs_list = []
    for cv_dir in cv_dirs:
        log_file = glob.glob(os.path.join(cv_dir, "log.txt"))[0]
        with smart_open(log_file, "r") as f:
            for line in f:
                m = fold_auc_pattern.search(line)
                if m is None:
                    continue
                aucs_list.append((float(m.group(1)), float(m.group(2))))
    aurocs, auprcs = zip(*aucs_list)
    return aurocs, auprcs
示例#24
0
 def load(self):
     """Open the memmap and read the entry names that label it.

     Blank lines in the entry names file are ignored. The memmap length must
     agree with the number of names (the index computed for the last row and
     second-to-last column must be the memmap's final index); otherwise a
     `ValueError` is raised.
     """
     self.array = np.memmap(self.memmap_file, mode="r", dtype=self.dtype)
     self.entry_names = []
     with smart_open(self.entry_names_file, "r") as names_fh:
         stripped = (raw.rstrip() for raw in names_fh)
         self.entry_names.extend(entry for entry in stripped if entry)

     size = len(self.entry_names)
     last_index = self._get_tril_index_from_indices(size - 1, size - 2)
     if last_index != self.array.shape[0] - 1:
         raise ValueError(("Number of items in memmap does not match "
                           "number of row names."))
     self.shape = (size, size)
     self.update_name_to_index_map()
示例#25
0
文件: fprint.py 项目: zeromtmu/e3fp
def _load(f, update_structure=True):
    """Unpickle every object stored back-to-back in a file.

    Parameters
    ----------
    f : str or filehandle
        Source passed to `smart_open`.
    update_structure : bool, optional
        If True, each loaded object is rebuilt through its class's
        `from_fingerprint`. Objects for which that raises `AttributeError`
        are kept as loaded.

    Returns
    -------
    list
        Loaded objects in file order.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    fps = []
    # Pickle data is binary; text mode breaks it for uncompressed files.
    with smart_open(f, "rb") as fh:
        try:
            while True:
                fp = pkl.load(fh)
                if update_structure:
                    try:
                        fps.append(fp.__class__.from_fingerprint(fp))
                    except AttributeError:
                        fps.append(fp)
                else:
                    fps.append(fp)
        except EOFError:
            # End of file: every pickled object has been read.
            pass

    return fps
示例#26
0
def native_tuples_to_molecules(molecules_file, native_tuples_lists_iter,
                               smiles_dict, fp_type):
    """Given an iterable of native tuples lists, write to molecules file.

    After the fingerprint type header and the column header, one row
    (fingerprint name, smiles, native fingerprint) is written per tuple. The
    smiles for a whole list is looked up from the name in its first tuple:
    by `proto_name` first, then by `mol_name` (None if neither is known).

    Parameters
    ----------
    molecules_file : str
        Output file.
    native_tuples_lists_iter : iterable of list
        Each list holds (native fingerprint, fingerprint name) tuples.
    smiles_dict : dict
        Maps molecule names to smiles.
    fp_type : object
        Provides `write(writer)` to emit the fingerprint type header.
    """
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for i, native_tuples_list in enumerate(native_tuples_lists_iter):
            if len(native_tuples_list) > 0:
                # The name is the same for every row of this list, so parse
                # it and look the smiles up once rather than per row.
                mol_item_name = MolItemName.from_str(native_tuples_list[0][1])
                smiles = smiles_dict.get(
                    mol_item_name.proto_name,
                    smiles_dict.get(mol_item_name.mol_name))
                for fp_native, fp_name in native_tuples_list:
                    writer.writerow((fp_name, smiles, fp_native))
            # Logged after the rows are written, so the message is accurate.
            logging.debug(
                "Wrote native strings for molecule {:d} to molecules file.".
                format(i + 1))
示例#27
0
def main(sdf_dir, out_sdf_file, first=3):
    """Write the first conformers of every SDF file in a directory to one SDF.

    Parameters
    ----------
    sdf_dir : str
        Directory searched for files matching ``*sdf*``.
    out_sdf_file : str
        Output SDF file.
    first : int or None, optional
        Number of conformers to keep per input file. -1 or None keeps all.
    """
    sdf_files = glob.glob(os.path.join(sdf_dir, "*sdf*"))
    sdf_files = sorted(sdf_files, key=mol_conf_id_from_fn)

    keep_all = first in (-1, None)
    # Honour the `first` argument when reading instead of the module-level
    # constant, so a caller-supplied limit actually takes effect.
    # NOTE(review): one more conformer than is written is requested, as in
    # the previous `FIRST + 1`; confirm whether the extra one is needed.
    read_num = None if keep_all else first + 1

    with smart_open(out_sdf_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        for j, sdf_file in enumerate(sdf_files):
            mol = mol_from_sdf(sdf_file, conf_num=read_num)
            proto_name = mol.GetProp("_Name")
            mol_name, _ = mol_conf_id_from_fn(proto_name)
            mol.SetProp("_Name", mol_name)
            conf_ids = [conf.GetId() for conf in mol.GetConformers()]
            for i in conf_ids:
                if not keep_all and i >= first:
                    break
                writer.write(mol, confId=i)
            if j > 0 and j % 100 == 0:
                print(j)
        writer.close()
示例#28
0
def target_aucs_from_cv_dirs(cv_dirs):
    """Collect per-target AUROC/AUPRC values from cross-validation logs.

    Every line of each directory's ``log.txt`` that matches
    ``Target <id> ... AUROC of 0.<digits> ... AUPRC of 0.<digits>``
    contributes one value to each of that target's lists.

    Parameters
    ----------
    cv_dirs : str or iterable of str
        One cross-validation directory or several, each containing a
        ``log.txt``.

    Returns
    -------
    tuple of dict
        (target id -> list of AUROCs, target id -> list of AUPRCs).

    Raises
    ------
    IndexError
        If a directory has no ``log.txt``.
    """
    # Raw string: "\w", "\d" and "\." are invalid escapes in a normal
    # literal.
    target_auc_pattern = re.compile(
        r'Target ([\w\d]+) .*AUROC of (0\.\d+).*AUPRC of (0\.\d+)')
    target_aurocs_dict = {}
    target_auprcs_dict = {}
    if isinstance(cv_dirs, str):
        cv_dirs = [cv_dirs]
    for cv_dir in cv_dirs:
        log_file = glob.glob(os.path.join(cv_dir, "log.txt"))[0]
        with smart_open(log_file, "r") as f:
            for line in f:
                m = target_auc_pattern.search(line)
                if m is None:
                    continue
                tid = m.group(1)
                auroc, auprc = float(m.group(2)), float(m.group(3))
                target_aurocs_dict.setdefault(tid, []).append(auroc)
                target_auprcs_dict.setdefault(tid, []).append(auprc)
    return target_aurocs_dict, target_auprcs_dict
示例#29
0
文件: util.py 项目: RhDm/e3fp
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write RDKit ``Mol`` objects to an SDF file.

    Each conformer of `mol` is written as its own record.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all
        (``None`` and -1 both mean no limit).
    """
    touch_dir(os.path.dirname(out_file))
    # Counted explicitly: the loop variable is unbound when there are no
    # conformers and is one too high after the early `break`.
    saved_num = 0
    with smart_open(out_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        for i in conf_ids:
            if conf_num not in {-1, None} and i >= conf_num:
                break
            writer.write(mol, confId=i)
            saved_num += 1
        writer.close()
    logging.debug("Saved {:d} conformers to {}.".format(saved_num, out_file))
示例#30
0
def compute_average_metrics(cv_dir, thresh=None):
    """Compute fold metrics averaged across fold.

    The target/molecule array is taken from ``inputs.pkl.bz2`` in `cv_dir`.
    Every sub-directory of `cv_dir` is treated as a fold and scored with
    `compute_fold_metrics`, using the first file matching ``*mask*`` and the
    first matching ``*result*`` in it.

    Parameters
    ----------
    cv_dir : str
        Cross-validation directory.
    thresh : float or None, optional
        Passed through to `compute_fold_metrics`.

    Returns
    -------
    ndarray
        Mean over folds of (p-value, sensitivity, specificity, precision,
        F1).
    """
    input_file = os.path.join(cv_dir, "inputs.pkl.bz2")
    fold_dirs = glob.glob(os.path.join(cv_dir, "*/"))

    logging.debug("Loading input files.")
    with smart_open(input_file, "rb") as f:
        (fp_array, mol_to_fp_inds, target_mol_array, target_list,
         mol_list) = pkl.load(f)
    # Only the target/molecule array is needed; release the rest.
    del fp_array, mol_to_fp_inds, target_list, mol_list

    if issparse(target_mol_array):
        # The `np.bool` alias was removed from NumPy; the builtin `bool`
        # selects the same dtype on every version.
        target_mol_array = target_mol_array.toarray().astype(bool)

    fold_metrics = []
    for fold_dir in sorted(fold_dirs):
        mask_file = glob.glob(os.path.join(fold_dir, "*mask*"))[0]
        results_file = glob.glob(os.path.join(fold_dir, "*result*"))[0]
        fold_metric = compute_fold_metrics(target_mol_array,
                                           mask_file,
                                           results_file,
                                           thresh=thresh)
        fold_metrics.append(fold_metric)

    fold_metrics = np.asarray(fold_metrics)
    mean_metrics = fold_metrics.mean(axis=0)
    std_metrics = fold_metrics.std(axis=0)
    logging.debug(
        ("P-value: {:.4g} +/- {:.4g}  "
         "Sensitivity: {:.4f} +/- {:.4f}  "
         "Specificity: {:.4f} +/- {:.4f}  "
         "Precision: {:.4f}  +/- {:.4f}  "
         "F1: {:.4f} +/- {:.4f}").format(mean_metrics[0], std_metrics[0],
                                         mean_metrics[1], std_metrics[1],
                                         mean_metrics[2], std_metrics[2],
                                         mean_metrics[3], std_metrics[3],
                                         mean_metrics[4], std_metrics[4]))
    return mean_metrics