def test_save_txt(self):
    """Ensure bitstrings are saved to text files correctly.

    Saves a small database with and without fingerprint names and compares
    the file contents with the expected bitstrings. Each temporary file is
    removed even if saving or the assertion fails.
    """
    from e3fp.fingerprint.db import FingerprintDatabase
    from python_utilities.io_tools import smart_open

    array = np.array(
        [[1, 0, 0, 1, 1], [0, 0, 0, 1, 0], [0, 1, 1, 1, 1]], dtype=np.bool_
    )
    db = FingerprintDatabase.from_array(array, ["1", "2", "3"])

    # (keyword arguments for `savetxt`, expected file contents)
    cases = (
        ({}, b"10011 1\n00010 2\n01111 3\n"),
        ({"with_names": False}, b"10011\n00010\n01111\n"),
    )
    for save_kwargs, exp_bitstring in cases:
        desc, txt_file = tempfile.mkstemp(suffix=".txt.gz")
        os.close(desc)
        try:
            db.savetxt(txt_file, **save_kwargs)
            with smart_open(txt_file, "r") as f:
                bitstring = f.read()
            self.assertEqual(bitstring, exp_bitstring)
        finally:
            # Clean up even when the comparison above fails.
            os.unlink(txt_file)
def main(sdf_dir, mol_file, num_pairs=10000,
         out_sdf_file="random_pairs.sdf.bz2"):
    """Sample random protomers and write one molecule pair per protomer.

    Only entries of the molecules file with more than one item are kept.
    They are sampled without replacement, each weighted inversely to the
    number of kept entries sharing the same name prefix (the text before
    the first "-").

    Parameters
    ----------
    sdf_dir : str
        Directory handed to `get_random_pairs`.
    mol_file : str
        Molecules file read with `molecules_to_lists_dicts`.
    num_pairs : int, optional
        Number of entries to sample; one pair is written for each.
    out_sdf_file : str, optional
        Path of the SDF file to write.
    """
    logging.info("Loading molecules file.")
    _, mol_list_dict, _ = molecules_to_lists_dicts(
        mol_file, merge_proto=False)
    mol_list_dict = {k: v for k, v in mol_list_dict.items() if len(v) > 1}

    logging.info("Picking random molecules.")
    mol_proto_num = {}
    for proto_name in mol_list_dict:
        mol_name = proto_name.split("-")[0]
        mol_proto_num[mol_name] = mol_proto_num.get(mol_name, 0) + 1
    proto_names, proto_nums = zip(*[(k, mol_proto_num[k.split("-")[0]])
                                    for k in mol_list_dict.keys()])
    proto_probs = 1. / np.asanyarray(proto_nums)
    proto_probs /= np.sum(proto_probs)
    random_proto_names = np.random.choice(proto_names, size=num_pairs,
                                          replace=False, p=proto_probs)

    with smart_open(out_sdf_file, "wb") as f:
        writer = rdkit.Chem.SDWriter(f)
        try:
            for i, proto_name in enumerate(sorted(random_proto_names)):
                mol1, mol2 = get_random_pairs(proto_name, sdf_dir)
                writer.write(mol1)
                writer.write(mol2)
                if i > 0 and i % 100 == 0:
                    logging.info(i)
        finally:
            # Close the writer before the underlying file is closed so that
            # any buffered records are flushed.
            writer.close()
def compute_fold_metrics(target_mol_array, mask_file, results_file,
                         thresh=None):
    """Compute fold metrics at the maximum-F1 threshold or a given one.

    The pickled mask selects test entries (mask value 1); entries whose
    score is NaN are dropped before the metrics are computed. When `thresh`
    is None it is taken from `get_max_f1_thresh`.

    Returns
    -------
    tuple
        (pvalue, sensitivity, specificity, precision, f1), where pvalue is
        ``10**(-thresh)``.
    """
    logging.info("Loading mask file.")
    with smart_open(mask_file, "rb") as mask_handle:
        fold_mask = pkl.load(mask_handle)
    is_test = fold_mask == 1
    del fold_mask

    logging.info("Loading results from file.")
    with np.load(results_file) as npz:
        scores = npz["results"]

    logging.info("Computing metrics.")
    labels_flat = target_mol_array[is_test].ravel()
    scores_flat = scores[is_test].ravel()
    # Positions whose score is a real number (i.e. not NaN).
    keep = np.where(~np.isnan(scores_flat))
    labels_flat = labels_flat[keep]
    scores_flat = scores_flat[keep]
    del scores, is_test, target_mol_array

    if thresh is None:
        _, thresh = get_max_f1_thresh(labels_flat, scores_flat)
    pvalue = 10**(-thresh)
    metrics = get_metrics_at_thresh(labels_flat, scores_flat, thresh)
    sensitivity, specificity, precision, f1 = metrics

    message = ("P-value: {:.4g} Sensitivity: {:.4f} "
               "Specificity: {:.4f} Precision: {:.4f} "
               "F1: {:.4f}")
    logging.debug(message.format(pvalue, sensitivity, specificity,
                                 precision, f1))
    return (pvalue, sensitivity, specificity, precision, f1)
def get_num_mols(mol_names_file):
    """Return the number of lines in the file that are not blank."""
    with smart_open(mol_names_file, 'rb') as handle:
        # A line counts when something remains after stripping trailing
        # whitespace.
        return sum(1 for line in handle if line.rstrip())
def save_fold_files(self, train_test_mask, mol_list, target_list,
                    smiles_dict, mol_list_dict, fp_type, target_dict):
    """Save the fold mask and, for SEA search CV, train/test input files.

    The mask is always pickled to `self.mask_file`. The molecules and
    targets files are only written when `self.cv_method` is a
    `SEASearchCVMethod`, and then only if `self.overwrite` is set or at
    least one of the four files is missing.
    """
    with smart_open(self.mask_file, "wb") as mask_handle:
        pkl.dump(train_test_mask, mask_handle, pkl.HIGHEST_PROTOCOL)

    if not isinstance(self.cv_method, SEASearchCVMethod):
        return

    test_molecules_file = os.path.join(self.out_dir,
                                       "test_molecules.csv.bz2")
    test_targets_file = os.path.join(self.out_dir, "test_targets.csv.bz2")
    train_molecules_file = os.path.join(self.out_dir,
                                        "train_molecules.csv.bz2")
    train_targets_file = os.path.join(self.out_dir,
                                      "train_targets.csv.bz2")

    fold_files = (test_molecules_file, test_targets_file,
                  train_molecules_file, train_targets_file)
    if not self.overwrite and all(
            os.path.isfile(path) for path in fold_files):
        # Everything is already on disk and may not be overwritten.
        return

    split = train_test_dicts_from_mask(
        mol_list_dict, mol_list, target_dict, target_list, train_test_mask)
    (train_mol_list_dict, train_target_dict,
     test_mol_list_dict, test_target_dict) = split

    lists_dicts_to_molecules(test_molecules_file, smiles_dict,
                             test_mol_list_dict, fp_type)
    lists_dicts_to_molecules(train_molecules_file, smiles_dict,
                             train_mol_list_dict, fp_type)

    train_target_dict = targets_to_mol_lists_targets(
        train_target_dict, train_mol_list_dict)
    test_target_dict = targets_to_mol_lists_targets(
        test_target_dict, test_mol_list_dict)

    dict_to_targets(train_targets_file, train_target_dict)
    dict_to_targets(test_targets_file, test_target_dict)
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write RDKit `Mol` objects to an SDF file.

    Each written conformer is tagged with its own energy property when an
    energy is available for it. The per-conformer property is cleared
    afterwards and the conformer energies are re-attached to `mol`.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all.
    """
    touch_dir(os.path.dirname(out_file))
    # Counted explicitly: the loop variable is unbound when the molecule has
    # no conformers and is off by one after an early break.
    num_saved = 0
    with smart_open(out_file, "w") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        conf_energies = get_conformer_energies_from_mol(mol)
        mol.ClearProp(CONF_ENERGIES_PROPNAME)
        for i in conf_ids:
            if conf_num not in {-1, None} and i >= conf_num:
                break
            try:
                conf_energy = conf_energies[i]
                mol.SetProp(CONF_ENERGY_PROPNAME,
                            "{:.4f}".format(conf_energy))
            except (IndexError, TypeError):
                # No usable energy for this conformer; write it untagged.
                pass
            writer.write(mol, confId=i)
            num_saved += 1
        writer.close()
    mol.ClearProp(CONF_ENERGY_PROPNAME)
    if conf_energies is not None:
        add_conformer_energies_to_mol(mol, conf_energies)
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
def smiles_generator(*filenames):
    """Parse SMILES file(s) and yield (smile, name).

    Parameters
    ----------
    *filenames : str
        Files containing smiles. Each file must contain one smile per line,
        followed by whitespace and then the molecule name.

    Yields
    ------
    tuple
        `tuple` of the format (smile, name), i.e. the first two
        whitespace-separated fields of the line. Lines with fewer than two
        fields are skipped with a warning.
    """
    for filename in filenames:
        with smart_open(filename, "r") as f:
            for i, line in enumerate(f):
                values = line.rstrip("\r\n").split()
                if len(values) >= 2:
                    yield tuple(values[:2])
                else:
                    # No exception is active here, so `exc_info` must not be
                    # requested (it would log a spurious "NoneType: None").
                    logging.warning(
                        "Line {:d} of {} has {:d} entries. Expected at least"
                        " 2.".format(i + 1, filename, len(values))
                    )
def results_from_fold_dir(fold_dir, basename="combined"):
    """Unpickle every file in `fold_dir` named ``<basename>.*``.

    Returns the loaded objects as a list, in the order `glob.glob` reports
    the files.
    """
    pattern = os.path.join(fold_dir, "{}.*".format(basename))

    def _unpickle(path):
        # Load a single pickled results file.
        logging.debug("Opening {}...".format(path))
        with smart_open(path, "rb") as handle:
            return pkl.load(handle)

    return [_unpickle(path) for path in glob.glob(pattern)]
def load_fit_file(self, fit_file):
    """Load target fit from file.

    The first entry under the pickled ``'hidden'`` key is transposed and
    passed to `create_clf`; the resulting classifier then loads its
    parameters from the same file.
    """
    with smart_open(fit_file, "rb") as handle:
        hidden_layers = pkl.load(handle)['hidden']
    classifier = self.create_clf(data=hidden_layers[0].T)
    classifier.load_params_from(fit_file)
    return classifier
def main(sdf_dir, mol_file, num_confs=10000,
         out_conf_file="random_conformers.txt",
         out_sdf_file="random_conformers.sdf.bz2",
         out_mol_file="random_conformers.csv.bz2"):
    """Pick random conformers and save their names, molecules and SDF.

    If `out_mol_file` already exists, the conformers listed in it are
    reused. Otherwise conformers are drawn at random (molecule, then
    protomer, then conformer) from `mol_file` until `num_confs` distinct
    ones are collected, and the selection is written to `out_mol_file`.

    Parameters
    ----------
    sdf_dir : str
        Directory searched for ``<protomer name>.sdf*`` files.
    mol_file : str
        Molecules file to sample from.
    num_confs : int, optional
        Number of distinct conformers to pick.
    out_conf_file : str, optional
        Text file receiving one conformer name per line.
    out_sdf_file : str, optional
        SDF file receiving the picked conformers.
    out_mol_file : str, optional
        Molecules file receiving (or providing) the selection.
    """
    confs = set()
    if os.path.isfile(out_mol_file):
        logging.info("Loading existing random molecules.")
        _, conf_mol_list_dict, _ = molecules_to_lists_dicts(
            out_mol_file, merge_proto=False)
        for proto_name in conf_mol_list_dict:
            for _, conf_name in conf_mol_list_dict[proto_name]:
                confs.add(split_conf_name(conf_name))
    else:
        logging.info("Loading molecules file.")
        smiles_dict, mol_list_dict, fp_type = molecules_to_lists_dicts(
            mol_file, merge_proto=False)
        mol_name_to_proto_names = {}
        for proto_name in mol_list_dict:
            mol_name, _ = split_conf_name(proto_name)
            mol_name_to_proto_names.setdefault(mol_name, []).append(
                proto_name)
        # `random.choice` needs an indexable sequence; a dict key view is
        # not one on Python 3. Building the list once also avoids copying
        # the keys on every draw.
        mol_names = list(mol_name_to_proto_names)
        conf_mol_list_dict = {}
        logging.info("Picking random molecules.")
        # NOTE(review): this loop cannot finish if `num_confs` exceeds the
        # number of distinct conformers available in `mol_file`.
        while len(confs) < num_confs:
            mol_name = random.choice(mol_names)
            proto_name = random.choice(mol_name_to_proto_names[mol_name])
            _, conf_name = random.choice(mol_list_dict[proto_name])
            conf = split_conf_name(conf_name)
            confs.add(conf)
            conf_mol_list_dict.setdefault(proto_name, set()).add(
                mol_list_dict[proto_name][conf[2]])
            if len(confs) % 100 == 0:
                logging.info(len(confs))
        conf_mol_list_dict = {k: sorted(v)
                              for k, v in conf_mol_list_dict.items()}
        lists_dicts_to_molecules(out_mol_file, smiles_dict,
                                 conf_mol_list_dict, fp_type)

    confs = sorted(confs)
    logging.info("Writing mol names to file.")
    with open(out_conf_file, "w") as f:
        for conf in confs:
            f.write("{}\n".format(join_conf_name(*conf)))

    logging.info("Saving mols to SDF file.")
    with smart_open(out_sdf_file, "wb") as f:
        writer = rdkit.Chem.SDWriter(f)
        for j, conf in enumerate(confs):
            mol_name, proto_id, conf_id = conf
            sdf_file = glob.glob(os.path.join(
                sdf_dir, "{}.sdf*".format(
                    join_conf_name(mol_name, proto_id))))[0]
            # Read just enough conformers to reach the requested one.
            mol = mol_from_sdf(sdf_file, conf_num=conf_id + 1)
            name = join_conf_name(*conf)
            mol.SetProp("_Name", name)
            writer.write(mol, confId=conf_id)
            if j > 0 and j % 10 == 0:
                logging.info(j)
        writer.close()
def save_fit_file(self, target_key, clf):
    """Save target fit to file.

    `target_key` is first resolved to a file name with
    `_fit_file_from_target_key`; if that fails it is used as the file name
    itself. Returns the path the classifier was pickled to.
    """
    try:
        fit_file = self._fit_file_from_target_key(target_key)
    except Exception:  # assume target_key is a fit file.
        fit_file = target_key
    # Pickle data is bytes, so the file is opened in binary mode.
    with smart_open(fit_file, "wb") as f:
        pkl.dump(clf, f)
    return fit_file
def main(molecules_file, library_file,
         target_results_pickle=TARGET_RESULTS_PICKLE_DEF,
         mol_results_pickle=MOL_RESULTS_PICKLE_DEF, log_file=None,
         verbose=False):
    """Run SEA set searches for a molecules file and pickle the results.

    The searcher's `target_results_dict` is pickled to
    `target_results_pickle` and its `set_results_dict` to
    `mol_results_pickle`.
    """
    setup_logging(log_file, verbose=verbose)

    logging.info("Loading molecules file")
    # Only the molecule lists are needed for the search.
    _, mol_lists_dict, _ = molecules_to_lists_dicts(molecules_file)

    num_mols = len(mol_lists_dict)
    logging.info("Running SEA searches with {:d} molecules.".format(
        num_mols))
    set_searcher = sea_set_search(library_file, mol_lists_dict)

    logging.info("Saving results to pickles.")
    outputs = ((target_results_pickle, "target_results_dict"),
               (mol_results_pickle, "set_results_dict"))
    for out_path, attr_name in outputs:
        with smart_open(out_path, "wb") as handle:
            pickle.dump(getattr(set_searcher, attr_name), handle)
def main(mfile1, mfile2, name1, name2, out_file, precision=PRECISION,
         log_freq=LOG_FREQ, num_proc=None, parallel_mode=None):
    """Count binned value pairs from two memmaps and write them to CSV.

    The pair indices are split into chunks that are counted with
    `count_tcs` through a `Parallelizer`; the per-chunk counts are merged
    and written as rows of (`name1` value, `name2` value, count).

    Parameters
    ----------
    mfile1, mfile2 : str
        Memmap files; both must have the same shape.
    name1, name2 : str
        Column names for the two value columns. Also used to build the
        default output file name.
    out_file : str
        Output CSV path. If empty, a name is derived from `name1` and
        `name2`, lower-cased with whitespace replaced by underscores.
    precision : int, optional
        Number of decimals; counted keys are divided by ``10**precision``.
    log_freq : int, optional
        Forwarded to `count_tcs`.
    num_proc : int or None, optional
        Forwarded to `Parallelizer`.
    parallel_mode : str or None, optional
        Forwarded to `Parallelizer`.

    Raises
    ------
    ValueError
        If the two memmaps differ in shape.
    """
    import re  # local import: only needed to build the default file name

    setup_logging()
    if not out_file:
        # `str.replace` treats '\s' as a literal backslash + "s", so a regex
        # substitution is required to actually replace whitespace.
        out_file = "{}_{}_tcs.csv.gz".format(
            re.sub(r"\s", "_", name1.lower()),
            re.sub(r"\s", "_", name2.lower()))

    # Load files
    mmap1 = load_mmap(mfile1)
    mmap2 = load_mmap(mfile2)
    if mmap1.shape != mmap2.shape:
        raise ValueError(
            "Memmaps do not have the same shape: {} {}".format(
                mmap1.shape, mmap2.shape))

    # Count binned pairs
    pair_num = mmap1.shape[0]
    del mmap1, mmap2

    para = Parallelizer(parallel_mode=parallel_mode, num_proc=num_proc)
    num_proc = max(para.num_proc - 1, 1)
    # Inclusive (start, end) index bounds that together cover
    # 0 .. pair_num - 1.
    chunk_bounds = np.linspace(-1, pair_num - 1, num_proc + 1, dtype=int)
    chunk_bounds = list(zip(chunk_bounds[:-1] + 1, chunk_bounds[1:]))
    logging.info("Divided into {} chunks with ranges: {}".format(
        num_proc, chunk_bounds))

    logging.info("Counting TCs in chunks.")
    kwargs = {"mfile1": mfile1, "mfile2": mfile2, "precision": precision,
              "log_freq": log_freq}
    results_iter = para.run_gen(count_tcs, chunk_bounds, kwargs=kwargs)
    tc_pair_counts = Counter()
    for chunk_counts, _ in results_iter:
        if not isinstance(chunk_counts, dict):
            logging.error("Results are not in dict form.")
            continue
        tc_pair_counts.update(chunk_counts)

    # Write pairs to file
    logging.info("Writing binned pairs to {}.".format(out_file))
    # Float divisor so integer keys are never truncated by integer division
    # (relevant under Python 2).
    mult = float(10**precision)
    with smart_open(out_file, "wb") as f:
        writer = csv.writer(f, delimiter=SEP)
        writer.writerow([name1, name2, "Count"])
        for pair in sorted(tc_pair_counts):
            writer.writerow([round(pair[0] / mult, precision),
                             round(pair[1] / mult, precision),
                             tc_pair_counts[pair]])

    total_counts = sum(tc_pair_counts.values())
    if total_counts != pair_num:
        logging.warning(
            "Pair counts {} did not match expected number {}".format(
                total_counts, pair_num))
        return
    logging.info("Completed.")
def mol_from_sdf(sdf_file, conf_num=None, standardise=False):
    """Read SDF file into an RDKit `Mol` object.

    Parameters
    ----------
    sdf_file : str
        Path to an SDF file
    conf_num : int or None, optional
        Maximum number of conformers to read from file. Defaults to all.
    standardise : bool (default False)
        Clean mol through standardisation

    Returns
    -------
    RDKit Mol : `Mol` object with each molecule in SDF file as a conformer

    Notes
    -----
    Records the supplier yields as None are skipped with a warning and do
    not count towards `conf_num`. If no record is read at all, `mol` stays
    None and the attribute access further down raises `AttributeError`.
    """
    mol = None
    conf_energies = []
    with smart_open(sdf_file, "r") as f:
        supplier = rdkit.Chem.ForwardSDMolSupplier(f)
        i = 0
        while True:
            if i == conf_num:
                break
            try:
                new_mol = next(supplier)
            except StopIteration:
                logging.debug("Read {:d} conformers from {}.".format(
                    i, sdf_file))
                break
            if new_mol is None:
                # The supplier yields None for records it cannot parse.
                logging.warning(
                    "Skipping unreadable record in {}.".format(sdf_file))
                continue
            if new_mol.HasProp(CONF_ENERGY_PROPNAME):
                conf_energies.append(
                    float(new_mol.GetProp(CONF_ENERGY_PROPNAME)))
            if mol is None:
                # The first record provides the topology; its conformer is
                # re-added below along with all later ones.
                mol = rdkit.Chem.Mol(new_mol)
                mol.RemoveAllConformers()
            conf = new_mol.GetConformers()[0]
            mol.AddConformer(conf, assignId=True)
            i += 1
    if standardise:
        mol = mol_to_standardised_mol(mol)
    try:
        mol.GetProp("_Name")
    except KeyError:
        # Fall back to the file name (up to ".sdf") as the molecule name.
        name = os.path.basename(sdf_file).split(".sdf")[0]
        mol.SetProp("_Name", name)

    if len(conf_energies) > 0:
        add_conformer_energies_to_mol(mol, conf_energies)
        mol.ClearProp(CONF_ENERGY_PROPNAME)
    return mol
def savetxt(self, fn, with_names=True):
    """Save bitstring representation to text file.

    Only implemented for `fp_type` of `Fingerprint`. This should not be
    attempted for large numbers of bits.

    Parameters
    ----------
    fn : str or filehandle
        Out file. Extension is automatically parsed to determine whether
        compression is used.
    with_names : bool, optional
        Include name of fingerprint in same row after bitstring.

    Raises
    ------
    E3FPInvalidFingerprintError
        If `fp_type` is not `Fingerprint`.
    E3FPEfficiencyWarning
        If `bits` is over 2^14 = 16384.
    """
    if self.fp_type is not Fingerprint:
        raise E3FPInvalidFingerprintError(
            "Only binary `Fingerprint` databases may be saved to "
            "bitstrings."
        )

    if self.bits > 2 ** 14:
        message = (
            "Saving sparse bitstrings to text file is highly "
            "inefficient for large bit lengths"
        )
        warnings.warn(
            message, category=E3FPEfficiencyWarning, stacklevel=2
        )

    row_fmt = "{0:s} {1:s}" if with_names else "{0:s}"

    with smart_open(fn, "w") as out:
        for row in range(self.fp_num):
            # Slice the on-bit positions of this row directly out of the
            # sparse matrix's underlying arrays.
            on_bits = self.array.indices[
                self.array.indptr[row] : self.array.indptr[row + 1]
            ]
            # Lengths of the zero runs before, between and after on bits.
            zero_runs = np.diff(np.r_[-1, on_bits, self.bits]) - 1
            bitstring = "1".join(["0" * run for run in zero_runs])
            out.write(row_fmt.format(bitstring, self.fp_names[row]) + "\n")
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Conformer files in `conf_dir` whose base name (up to the first ".") is
    in the SMILES file are fingerprinted, optionally through
    `parallelizer`, and written to the molecules file for `out_dir`. That
    file is then read back.

    Returns
    -------
    tuple
        (smiles dict, molecule lists dict, fingerprint type) as returned by
        `molecules_to_lists_dicts` for the written file.

    Raises
    ------
    Exception
        If no file in `conf_dir` matches a name in the SMILES file.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    # `dict.items()` is not indexable on Python 3; take the first item
    # through an iterator (None when the dict is empty).
    logging.debug("Example SMILES: {!r}".format(
        next(iter(smiles_dict.items()), None)))
    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(
        len(sdf_files)))

    if len(sdf_files) == 0:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}

    data_iterator = make_data_iterator(sdf_files)
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, _ = results
            except (TypeError, ValueError):
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Skip this result; without this the code below would use an
                # undefined or stale `fp_native_list`.
                continue
            if len(fp_native_list) == 0:
                # Nothing to write and no name to look the SMILES up with.
                continue
            proto_name = MolItemName.from_str(
                fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    del smiles_dict
    filtered_smiles_dict, mol_lists_dict, fp_type = (
        molecules_to_lists_dicts(molecules_file))
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
def save(self, fn="fingerprints.fps.bz2"):
    """Save database to file.

    The database object itself is pickled to the file.

    Parameters
    ----------
    fn : str, optional
        Filename or basename if extension does not include '.fps'
    """
    if ".fps" not in fn:
        fn += ".fps.bz2"
    # Pickle data is bytes, so the file is opened in binary mode.
    with smart_open(fn, "wb") as f:
        pkl.dump(self, f)
def lists_dicts_to_molecules(molecules_file, smiles_dict, mol_lists_dict,
                             fp_type):
    """Write dict of mol names to list of native tuples to a molecules file.

    Rows are written for the names of `smiles_dict` in sorted order; names
    without an entry in `mol_lists_dict` contribute no rows.
    """
    header = ("molecule id", "smiles", "fingerprint")
    with smart_open(molecules_file, "wb") as out:
        writer = csv.writer(out)
        fp_type.write(writer)
        writer.writerow(header)
        for mol_name in sorted(smiles_dict):
            smiles = smiles_dict[mol_name]
            native_tuples = mol_lists_dict.get(mol_name, [])
            writer.writerows(
                (fp_name, smiles, fp_native)
                for fp_native, fp_name in native_tuples)
def _save(f, *fps, **kwargs):
    """Pickle fingerprints one after another to a single file.

    Parameters
    ----------
    f : str
        File to write, opened through `smart_open`.
    *fps
        Objects to pickle, written sequentially in the given order.
    protocol : int or None, optional
        Pickle protocol (keyword only). None selects the highest protocol.

    Returns
    -------
    bool
        Always True.
    """
    protocol = kwargs.get("protocol")
    if protocol is None:
        protocol = pkl.HIGHEST_PROTOCOL
    # Pickle data is bytes, so the file is opened in binary mode.
    with smart_open(f, "wb") as fh:
        for fp in fps:
            pkl.dump(fp, fh, protocol)
    return True
def load(cls, fn):
    """Load database from file.

    The extension is used to determine how database was serialized
    (`save` vs `savez`).

    Parameters
    ----------
    fn : str
        Filename

    Returns
    -------
    FingerprintDatabase
        Database
    """
    if fn.endswith(".fpz"):
        # Compare the numeric major version; comparing version strings
        # lexicographically is fragile.
        try:
            scipy_major = int(scipy.__version__.split(".")[0])
        except ValueError:
            scipy_major = None
        if scipy_major is not None and scipy_major < 1:
            warnings.warn(
                (
                    "Use SciPy 1.0 or newer to efficiently load large "
                    "FingerprintDatabases."
                ),
                category=E3FPEfficiencyWarning,
                stacklevel=2,
            )
        # NOTE: `allow_pickle=True` unpickles object arrays; only load
        # files from trusted sources.
        # Read every array eagerly so the archive can be closed right away.
        with np.load(fn, allow_pickle=True) as data:
            array_dict = dict(data.items())
        # Keys with a leading underscore hold database properties.
        props_dict = {}
        for k in list(array_dict.keys()):
            if k.startswith("_"):
                props_dict[k[1:]] = array_dict.pop(k)
        array = csr_matrix(
            (
                array_dict["data"],
                array_dict["indices"],
                array_dict["indptr"],
            ),
            shape=array_dict["shape"],
        )
        return FingerprintDatabase.from_array(
            array,
            array_dict["fp_names"],
            fp_type=array_dict["fp_type"].item(),
            level=array_dict["level"].item(),
            name=array_dict["name"].item(),
            props=props_dict,
        )

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Pickle data is bytes, so the file is opened in binary mode explicitly.
    with smart_open(fn, "rb") as f:
        return pkl.load(f)
def process_fingerprints(self, fprint_dict):
    """Reduce each molecule's fingerprints according to `self.mode`.

    Parameters
    ----------
    fprint_dict : dict
        Maps a molecule name to a list of fingerprints.

    Returns
    -------
    dict
        Maps each molecule name to a list of processed fingerprints:

        - "union": the sum of the fingerprints converted to a
          `Fingerprint`, named after the molecule.
        - "mean": the mean fingerprint, named after the molecule.
        - "mean-boltzmann": the mean weighted by Boltzmann factors of the
          energies read from `self.energies_file` (tab-separated name and
          energy per line); unweighted if the factors sum to 0.
        - "first": a copy of the first fingerprint of each consecutive
          group sharing the name prefix before the first "_", renamed to
          that prefix.

        Any other mode yields an empty dict.
    """
    new_fprint_dict = {}
    if self.mode == "union":
        for mol_name, fprints in fprint_dict.items():
            new_fprint_dict[mol_name] = [
                fp.Fingerprint.from_fingerprint(fp.add(fprints))
            ]
            new_fprint_dict[mol_name][0].name = mol_name
    elif self.mode == "mean":
        for mol_name, fprints in fprint_dict.items():
            new_fprint_dict[mol_name] = [fp.mean(fprints)]
            new_fprint_dict[mol_name][0].name = mol_name
    elif self.mode == "mean-boltzmann":
        energies_dict = {}
        with smart_open(self.energies_file, "r") as f:
            for line in f:
                name, energy = line.rstrip().split('\t')
                energies_dict[name] = float(energy)

        for mol_name, fprints in fprint_dict.items():
            energies = np.array(
                [energies_dict[fprint.name] for fprint in fprints])
            # Shift by the minimum energy so the largest factor is
            # exp(0) = 1, which reduces overflow.
            e_min = energies.min()
            adjusted_energies = energies - e_min
            probs = np.exp(-adjusted_energies / KT)
            prob_sum = np.sum(probs)
            if prob_sum == 0.:
                logging.warning(
                    ("Boltzmann probabilities for {} sum to 0. Using "
                     "unweighted mean.").format(mol_name))
                new_fprint_dict[mol_name] = [fp.mean(fprints)]
            else:
                if prob_sum == 1. and probs.shape[0] > 1:
                    logging.info(
                        ("Boltzmann probabilities for {} dominated by 1 "
                         "term.").format(mol_name))
                new_fprint_dict[mol_name] = [
                    fp.mean(fprints, weights=probs)
                ]
            new_fprint_dict[mol_name][0].name = mol_name
    elif self.mode == "first":
        for mol_name, fprints in fprint_dict.items():
            new_fprint_dict[mol_name] = []
            # `groupby` only merges consecutive items, so fingerprints of
            # one group are expected to be adjacent in the list.
            for proto_name, proto_fprints in itertools.groupby(
                    fprints, key=lambda x: x.name.split('_')[0]):
                # Copy so that renaming does not alter the input.
                first_fprint = copy.deepcopy(next(proto_fprints))
                first_fprint.name = proto_name
                new_fprint_dict[mol_name].append(first_fprint)
    return new_fprint_dict
def load(cls, fn):
    """Load database from file.

    Parameters
    ----------
    fn : str
        Filename

    Returns
    -------
    FingerprintDatabase
        Database
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Pickle data is bytes, so the file is opened in binary mode explicitly
    # instead of relying on the default mode of `smart_open`.
    with smart_open(fn, "rb") as f:
        return pkl.load(f)
def prc_roc_aucs_from_cv_dirs(cv_dirs):
    """Collect fold AUROC/AUPRC values from cross-validation log files.

    Parameters
    ----------
    cv_dirs : str or iterable of str
        Directory or directories each containing a ``log.txt``.

    Returns
    -------
    tuple
        (aurocs, auprcs), two tuples of floats with one entry per matching
        "Fold ... AUROC of ... AUPRC of ..." log line. Both are empty if no
        line matches.
    """
    # NOTE(review): the pattern only accepts values written as "0.<digits>",
    # so a logged value of 1.0 would not be picked up -- confirm intended.
    fold_pattern = re.compile(
        r'Fold.*AUROC of (0\.\d+).*AUPRC of (0\.\d+)')
    if isinstance(cv_dirs, str):
        cv_dirs = [cv_dirs]
    aucs_list = []
    for cv_dir in cv_dirs:
        log_file = glob.glob(os.path.join(cv_dir, "log.txt"))[0]
        with smart_open(log_file, "r") as f:
            for line in f:
                m = fold_pattern.search(line)
                if m is None:
                    continue
                aucs_list.append((float(m.group(1)), float(m.group(2))))
    if not aucs_list:
        # `zip(*[])` yields nothing to unpack.
        return (), ()
    aurocs, auprcs = zip(*aucs_list)
    return aurocs, auprcs
def load(self):
    """Load memmap file and entry names file.

    Opens the memmap read-only, reads the non-blank lines of the entry
    names file, and verifies that the index of the last pair of entries
    corresponds to the last element of the memmap.

    Raises
    ------
    ValueError
        If the memmap length does not fit the number of entry names.
    """
    self.array = np.memmap(self.memmap_file, mode="r", dtype=self.dtype)

    self.entry_names = []
    with smart_open(self.entry_names_file, "r") as names_handle:
        stripped = (raw.rstrip() for raw in names_handle)
        self.entry_names.extend(name for name in stripped if name)

    size = len(self.entry_names)
    last_index = self._get_tril_index_from_indices(size - 1, size - 2)
    if last_index != self.array.shape[0] - 1:
        raise ValueError(("Number of items in memmap does not match "
                          "number of row names."))

    self.shape = (size, size)
    self.update_name_to_index_map()
def _load(f, update_structure=True):
    """Load all pickled fingerprints stored back to back in a file.

    Parameters
    ----------
    f : str
        File to read, opened through `smart_open`.
    update_structure : bool, optional
        If True, each object is rebuilt with its class's
        `from_fingerprint`; objects whose class lacks that method are kept
        unchanged.

    Returns
    -------
    list
        Loaded objects in file order.
    """
    fps = []
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Pickle data is bytes, so the file is opened in binary mode.
    with smart_open(f, "rb") as fh:
        try:
            while True:
                fp = pkl.load(fh)
                if update_structure:
                    try:
                        fps.append(fp.__class__.from_fingerprint(fp))
                    except AttributeError:
                        fps.append(fp)
                else:
                    fps.append(fp)
        except EOFError:
            # End of file: every pickled object has been read.
            pass
    return fps
def native_tuples_to_molecules(molecules_file, native_tuples_lists_iter,
                               smiles_dict, fp_type):
    """Given an iterable of native tuples lists, write to molecules file.

    For each list, the SMILES is looked up once from the name in the first
    tuple: by its `proto_name`, falling back to its `mol_name` (None if
    neither is in `smiles_dict`). Empty lists write no rows.
    """
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for i, native_tuples_list in enumerate(native_tuples_lists_iter):
            if len(native_tuples_list) > 0:
                # The name and SMILES are the same for every row of the
                # list, so resolve them once rather than per row.
                mol_item_name = MolItemName.from_str(
                    native_tuples_list[0][1])
                smiles = smiles_dict.get(
                    mol_item_name.proto_name,
                    smiles_dict.get(mol_item_name.mol_name))
                for fp_native, fp_name in native_tuples_list:
                    writer.writerow((fp_name, smiles, fp_native))
            logging.debug(
                "Wrote native strings for molecule {:d} to molecules file.".
                format(i + 1))
def main(sdf_dir, out_sdf_file, first=3):
    """Write the first conformers of every SDF file in a directory.

    Parameters
    ----------
    sdf_dir : str
        Directory searched for files matching ``*sdf*``.
    out_sdf_file : str
        Path of the combined SDF file to write.
    first : int or None, optional
        Maximum number of conformers to write per file. -1 or None writes
        all of them.
    """
    sdf_files = glob.glob(os.path.join(sdf_dir, "*sdf*"))
    sdf_files = sorted(sdf_files, key=mol_conf_id_from_fn)
    limit = None if first in (-1, None) else first
    # Read only as many conformers as may be written, honouring the `first`
    # argument. At least one is read so a molecule is always available.
    read_num = None if limit is None else max(limit, 1)
    with smart_open(out_sdf_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        for j, sdf_file in enumerate(sdf_files):
            mol = mol_from_sdf(sdf_file, conf_num=read_num)
            proto_name = mol.GetProp("_Name")
            mol_name, _ = mol_conf_id_from_fn(proto_name)
            mol.SetProp("_Name", mol_name)
            conf_ids = [conf.GetId() for conf in mol.GetConformers()]
            for i in conf_ids:
                # The limit is checked for None first: comparing an int with
                # None raises TypeError on Python 3.
                if limit is not None and i >= limit:
                    break
                writer.write(mol, confId=i)
            if j > 0 and j % 100 == 0:
                print(j)
        writer.close()
def target_aucs_from_cv_dirs(cv_dirs):
    """Collect per-target AUROC/AUPRC values from cross-validation logs.

    Parameters
    ----------
    cv_dirs : str or iterable of str
        Directory or directories each containing a ``log.txt``.

    Returns
    -------
    tuple of dict
        (target_aurocs_dict, target_auprcs_dict), each mapping a target id
        to the list of values found in matching
        "Target <id> ... AUROC of ... AUPRC of ..." log lines.
    """
    # NOTE(review): the pattern only accepts values written as "0.<digits>",
    # so a logged value of 1.0 would not be picked up -- confirm intended.
    target_pattern = re.compile(
        r'Target ([\w\d]+) .*AUROC of (0\.\d+).*AUPRC of (0\.\d+)')
    target_aurocs_dict = {}
    target_auprcs_dict = {}
    if isinstance(cv_dirs, str):
        cv_dirs = [cv_dirs]
    for cv_dir in cv_dirs:
        log_file = glob.glob(os.path.join(cv_dir, "log.txt"))[0]
        with smart_open(log_file, "r") as f:
            for line in f:
                m = target_pattern.search(line)
                if m is None:
                    continue
                tid = m.group(1)
                auroc, auprc = float(m.group(2)), float(m.group(3))
                target_aurocs_dict.setdefault(tid, []).append(auroc)
                target_auprcs_dict.setdefault(tid, []).append(auprc)
    return target_aurocs_dict, target_auprcs_dict
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write RDKit ``Mol`` objects to an SDF file.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all.
    """
    touch_dir(os.path.dirname(out_file))
    # Counted explicitly: the loop variable is unbound when the molecule has
    # no conformers and is off by one after an early break.
    num_saved = 0
    with smart_open(out_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        for i in conf_ids:
            if conf_num not in {-1, None} and i >= conf_num:
                break
            writer.write(mol, confId=i)
            num_saved += 1
        writer.close()
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
def compute_average_metrics(cv_dir, thresh=None):
    """Compute fold metrics averaged across fold.

    Parameters
    ----------
    cv_dir : str
        Cross-validation directory containing ``inputs.pkl.bz2`` and one
        sub-directory per fold, each holding a mask file (``*mask*``) and a
        results file (``*result*``).
    thresh : float or None, optional
        Threshold forwarded to `compute_fold_metrics`.

    Returns
    -------
    ndarray
        Mean over folds of (pvalue, sensitivity, specificity, precision,
        f1).
    """
    input_file = os.path.join(cv_dir, "inputs.pkl.bz2")
    fold_dirs = glob.glob(os.path.join(cv_dir, "*/"))

    logging.debug("Loading input files.")
    with smart_open(input_file, "rb") as f:
        # Only the target/molecule array is needed here.
        _, _, target_mol_array, _, _ = pkl.load(f)

    if issparse(target_mol_array):
        # The `np.bool` alias was removed from NumPy; the builtin `bool`
        # selects the same dtype.
        target_mol_array = target_mol_array.toarray().astype(bool)

    fold_metrics = []
    for fold_dir in sorted(fold_dirs):
        mask_file = glob.glob(os.path.join(fold_dir, "*mask*"))[0]
        results_file = glob.glob(os.path.join(fold_dir, "*result*"))[0]
        fold_metric = compute_fold_metrics(target_mol_array, mask_file,
                                           results_file, thresh=thresh)
        fold_metrics.append(fold_metric)

    fold_metrics = np.asarray(fold_metrics)
    mean_metrics = fold_metrics.mean(axis=0)
    std_metrics = fold_metrics.std(axis=0)
    logging.debug(
        ("P-value: {:.4g} +/- {:.4g} "
         "Sensitivity: {:.4f} +/- {:.4f} "
         "Specificity: {:.4f} +/- {:.4f} "
         "Precision: {:.4f} +/- {:.4f} "
         "F1: {:.4f} +/- {:.4f}").format(mean_metrics[0], std_metrics[0],
                                         mean_metrics[1], std_metrics[1],
                                         mean_metrics[2], std_metrics[2],
                                         mean_metrics[3], std_metrics[3],
                                         mean_metrics[4], std_metrics[4]))
    return mean_metrics