def __init__(self, uid, max_request=100, client_wait_time=10, server_wait_time=1):
    """Configure the per-uid working directories and request/wait settings.

    All working directories live under FINGERPRINTS_PATH and are keyed by
    ``uid``; nothing is created on disk here — paths are only computed.
    """
    # Plain settings first.
    self.uid = uid
    self.max_request = max_request
    self.client_wait_time = client_wait_time
    self.server_wait_time = server_wait_time
    # Directory layout: data / backup / cache / pending, plus a lock
    # directory next to the cache.
    self.data_dirname = safe_path_join("data", uid, dirname=FINGERPRINTS_PATH)
    self.backup_dirname = safe_path_join(
        "data", uid + ".backup", dirname=FINGERPRINTS_PATH)
    self.cache_dirname = safe_path_join("cache", uid, dirname=FINGERPRINTS_PATH)
    self.pending_dirname = safe_path_join(
        "pending", uid, dirname=FINGERPRINTS_PATH)
    self.lock_dirname = safe_path_join(
        "cache", uid + ".lock", dirname=FINGERPRINTS_PATH)
def add_pending_hx(self, hx):
    """Record *hx* as pending for this process.

    Saves a DataDict containing ``hx`` into a per-pid entry under the
    pending directory and returns the pair ``(dirname, lock_dirname)``
    (the matching per-pid cache path and the pending entry just written).
    """
    # exist_ok avoids the check-then-create race of the previous
    # `if not os.path.exists(...): os.makedirs(...)` pattern when several
    # processes register pending work concurrently.
    os.makedirs(self.pending_dirname, exist_ok=True)
    dirname = safe_path_join(str(os.getpid()), dirname=self.cache_dirname)
    lock_dirname = safe_path_join(str(os.getpid()), dirname=self.pending_dirname)
    # A pre-existing entry for this pid means a previous run did not clean
    # up — fail loudly rather than overwrite.  (Kept as asserts so callers
    # that catch AssertionError keep working; NOTE(review): asserts vanish
    # under `python -O` — consider raising explicitly.)
    assert not os.path.exists(dirname)
    assert not os.path.exists(lock_dirname)
    dd = DataDict()
    dd["hx"] = Variable(hx)
    dd._save(lock_dirname, self.uid)
    return dirname, lock_dirname
def get_all_pending_hx(self):
    """Return the unique union of all pending ``hx`` arrays.

    Reads every per-pid entry under the pending directory; when there are
    none, returns ``self.hash_x`` of an empty string array.
    """
    # EAFP: a single listdir instead of exists()+listdir() (which raced
    # against concurrent deletion of the pending directory).
    try:
        entries = os.listdir(self.pending_dirname)
    except FileNotFoundError:
        entries = []
    if not entries:
        # `np.str` was removed in NumPy 1.24; the builtin `str` is the
        # documented replacement and yields the same unicode dtype.
        return self.hash_x(np.array([], dtype=str))
    result = [
        DataDict(source_dirname=safe_path_join(
            dirname, dirname=self.pending_dirname))["hx"]
        for dirname in entries
    ]
    return np.unique(np.concatenate(result))
def is_chembl_target_uid(uid):
    """Return True when *uid* looks like a ChEMBL id: "CHEMBL" + digits."""
    return len(uid) > 6 and uid.startswith("CHEMBL") and uid[6:].isnumeric()


def export(target_uid, threshold, include_earliest_year, ic50_conversion_strategy, fit_ic50):
    """Export the mean_warszycki_logki pipeline for one ChEMBL target.

    Builds an output directory under $ANANAS_RESULTS_PATH named after the
    parameter combination, then dumps the pipeline's data nodes to HTML.
    Targets that are not valid CHEMBL ids, or whose output already exists,
    are skipped with a message.
    """
    mandalka.config(fake_del_arguments=True)  # necessary to describe nodes
    dirname = ("target:{}--threshold:{}--year:{}--which:{}"
               "--fitIC50:{}".format(
                   target_uid,
                   threshold,
                   include_earliest_year,
                   ic50_conversion_strategy,
                   fit_ic50,
               ))
    output = os.path.join(
        os.getenv("ANANAS_RESULTS_PATH"),
        "elderberries-Benchmarks2018",
        "data_pipelines",
        dirname,
    )
    if not os.path.exists(output):
        os.makedirs(output)
    elif not os.path.isdir(output):
        # Something non-directory is squatting on the output path.
        raise OSError("remove {}".format(output))
    if is_chembl_target_uid(target_uid):
        print("exporting target {}...".format(target_uid))
        pipeline = mean_warszycki_logki(
            target_uid=target_uid,
            chembl_filename="chembl_24.db",
            threshold=threshold,
            include_earliest_year=include_earliest_year,
            ic50_conversion_strategy=ic50_conversion_strategy,
            fit_ic50=fit_ic50,
        )
        dirname = safe_path_join(target_uid, dirname=output)
        if os.path.exists(dirname):
            print("{} already exists, skipping...".format(dirname))
        else:
            dump_nodes_to_html(
                pipeline["data_nodes"],
                dirname,
            )
    else:
        print("{} is not a valid target, skipping...".format(target_uid))
def dump_nodes_to_html(nodes, dirname):
    """Dump a pipeline's data nodes to a browsable tree of HTML pages.

    Creates ``dirname`` with two sub-trees:

    * ``per_node/`` — one sortable table per node plus an index page that
      also embeds each node's raw data dump;
    * ``per_uid/`` — one page per compound uid, an index of all uids, and
      ``deltas.html`` listing which uids each node added/removed relative
      to the previous node.

    Fails if ``dirname`` already exists (``os.makedirs`` without
    ``exist_ok``) or any output file exists (files opened with mode 'x').
    """
    describe = lambda node, depth: mandalka.unique_id(
        node) + ": " + mandalka.describe(node, depth)
    href_chembl_compound = lambda uid: href(
        "https://www.ebi.ac.uk/chembl/compound/inspect/{}".format(uid),
        uid,
    )
    href_chembl_document = lambda uid: href(
        "https://www.ebi.ac.uk/chembl/doc/inspect/{}".format(uid),
        uid,
    )

    def dump_node_to_html(node, fname):
        # One table with the node's full data; uid-like columns become
        # links into the ChEMBL web interface (compound vs. document).
        arr, header = to_arr_header(node.data)
        width = columns_width(arr, header, 30)
        arr, header = sanitize_html(arr), sanitize_html(header)
        for i, key in enumerate(header):
            # `np.str` was removed in NumPy 1.24; the builtin `str` gives
            # np.vectorize the same unicode output dtype.
            if "uid" in key and "doc" not in key:
                arr[:, i] = np.vectorize(href_chembl_compound,
                                         otypes=(str, ))(arr[:, i])
            if "uid" in key and "doc" in key:
                arr[:, i] = np.vectorize(href_chembl_document,
                                         otypes=(str, ))(arr[:, i])
        with open(fname, 'x') as f_out:
            f_out.write(
                doc_template(
                    style_template(
                        table_style_1("data_table"),
                        div_style_1(None),
                    ) + '\n' + tablesorter(),
                    to_html(arr, header, width, "data_table"),
                ))

    os.makedirs(dirname)
    os.makedirs(safe_path_join("per_uid", dirname=dirname))
    os.makedirs(safe_path_join("per_node", dirname=dirname))

    # per_node: one page per node + an index with raw data dumps.
    body = []
    for node in nodes:
        nname = sanitize_html(describe(node, depth=1))
        fname = "{}.html".format(node)
        dump_node_to_html(node,
                          safe_path_join("per_node", fname, dirname=dirname))
        body.append(div_template(href(fname, nname)))
        body.append('<pre white-space="pre-wrap">'
                    + sanitize_html(str(node.data)) + '</pre>')
    with open(safe_path_join("per_node", "index.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                style_template(div_style_1(None), ),
                '\n'.join(body),
            ))

    # per_uid index: tab-separated links to every uid page.
    href_uid = lambda uid: href("{}.html".format(uid), uid)
    all_uids = sorted(set([uid for n in nodes for uid in n.data["uid"]]))
    with open(safe_path_join("per_uid", "index.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                "",
                '\t'.join([href_uid(uid) for uid in all_uids]),
            ))

    # deltas: per node, which uids appeared/disappeared vs. the previous node.
    body = []
    for i in range(len(nodes)):
        u1 = set(nodes[i - 1].data["uid"]) if i > 0 else set()
        u2 = set(nodes[i].data["uid"])
        new_uids = '\t'.join([href_uid(uid) for uid in sorted(u2 - u1)])
        deleted_uids = '\t'.join([href_uid(uid) for uid in sorted(u1 - u2)])
        body += [
            div_template("⇨\t" + sanitize_html(describe(nodes[i], depth=1))),
            div_template("NEW UIDS"),
            div_template(new_uids),
            div_template("DELETED UIDS"),
            div_template(deleted_uids),
        ]
    with open(safe_path_join("per_uid", "deltas.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                style_template(div_style_1()),
                '\n'.join(body),
            ))

    # one page per uid: that uid's slice of every node, with prev/next links.
    for iuid, uid in enumerate(tqdm.tqdm(all_uids)):
        body = []
        body.append(div_template(href_chembl_compound(uid)))
        for n in nodes:
            body.append(div_template(sanitize_html(describe(n, depth=1))))
            data = n.data.slice[n.data["uid"] == uid]
            arr, header = to_arr_header(data)
            width = columns_width(arr, header, 30)
            arr, header = sanitize_html(arr), sanitize_html(header)
            for i, key in enumerate(header):
                if "uid" in key:
                    arr[:, i] = np.vectorize(href_uid,
                                             otypes=(str, ))(arr[:, i])
            body.append(to_html(arr, header, width, "data_table"))
        body.append(
            div_template(' '.join([
                href_uid(all_uids[iuid - 1]),
                href("index.html", "INDEX"),
                href_uid(all_uids[(iuid + 1) % len(all_uids)]),
            ])))
        with open(safe_path_join("per_uid", uid + ".html", dirname=dirname),
                  'x') as f_out:
            f_out.write(
                doc_template(
                    style_template(
                        table_style_1("data_table"),
                        div_style_1() + '\n' + tablesorter(),
                    ),
                    '\n'.join(body),
                ))
def plot_all(summary_name, summary, reference_solutions):
    """Render CV-vs-BAC scatter plots for every metric in *summary*.

    For each (cv_metric, bac_metric) pair and each results column, four SVG
    variants are written into
    ``$ANANAS_RESULTS_PATH/elderberries-Benchmarks2018/scores/<summary_name>/figures``,
    distinguished by a filename suffix and the (best_only, color_flat,
    legend) flag combination.
    """
    output_dir = safe_path_join(
        "elderberries-Benchmarks2018",
        "scores",
        summary_name,
        "figures",
        dirname=os.getenv("ANANAS_RESULTS_PATH"),
    )
    os.makedirs(output_dir)
    metric_pairs = [(metric, metric, reversed_metric)
                    for metric, (_, reversed_metric) in summary.metrics.items()]
    for metric in summary.closer_further_metrics:
        metric_pairs.append(
            (metric, metric + "_Closer", summary.metrics[metric][1]))
        metric_pairs.append(
            (metric, metric + "_Further", summary.metrics[metric][1]))
    for cv_metric, bac_metric, reversed_metric in metric_pairs:
        metric_plot_name = "CV_{}_BAC_{}".format(cv_metric, bac_metric)
        header, cv = summary.results(cv_metric, "cv")
        header, bac = summary.results(bac_metric, "bac")
        assert np.all(cv[0] == bac[0])  # both tables cover the same solutions
        solutions = cv[0]
        _ref_mask = np.vectorize(lambda x: x in reference_solutions)(solutions)
        _cv = np.concatenate(
            [cv[i].reshape(-1, 1) for i in range(1, len(header) - 1)], axis=1)
        _bac = np.concatenate(
            [bac[i].reshape(-1, 1) for i in range(1, len(header) - 1)], axis=1)
        # Negate non-reversed metrics so "max" always means "best" while
        # computing the axis limits; the negation is undone afterwards.
        if not reversed_metric:
            _cv, _bac = -_cv, -_bac
        xlim = np.array(
            (_cv.min(), min(_cv[_ref_mask].max() * 2 - _cv.min(), _cv.max())))
        ylim = np.array(
            (_bac.min(),
             min(_bac[_ref_mask].max() * 2 - _bac.min(), _bac.max())))
        if not reversed_metric:
            xlim, ylim = -xlim[::-1], -ylim[::-1]
        # Pad both axes by 5% of their span.
        xlim += .05 * np.array((xlim[0] - xlim[1], xlim[1] - xlim[0]))
        ylim += .05 * np.array((ylim[0] - ylim[1], ylim[1] - ylim[0]))
        xlabel = "cross validation score"
        ylabel = "balanced agglomerative clustering score"
        # NOTE(review): columns 1..12 are plotted; presumably results tables
        # always have 13 columns — confirm against summary.results.
        for i in range(1, 13):
            if i == len(header) - 1:  # ranking
                xlim = ylim = (-3, len(solutions) + 3)
                reversed_metric = True
            # Four variants per column — previously four near-identical
            # 15-argument plot() calls differing only in these flags.
            for suffix, best_only, color_flat, legend in (
                    ("", False, False, True),
                    ("_", False, True, True),
                    ("__", True, False, False),
                    ("___", True, True, False),
            ):
                plot(safe_path_join(
                         metric_plot_name + '_' + header[i] + suffix + ".svg",
                         dirname=output_dir),
                     solutions,
                     cv[i],
                     bac[i],
                     title=header[i],
                     reference_solutions=reference_solutions,
                     xlabel=xlabel,
                     ylabel=ylabel,
                     xlim=xlim,
                     ylim=ylim,
                     best_only=best_only,
                     color_flat=color_flat,
                     legend=legend,
                     reversed_metric=reversed_metric)
ylabel=ylabel, xlim=xlim, ylim=ylim, best_only=True, color_flat=True, legend=False, reversed_metric=reversed_metric) if __name__ == "__main__": for summary_name, summary, reference_solutions in SUMMARIES: output = safe_path_join( "elderberries-Benchmarks2018", "scores", summary_name, dirname=os.getenv("ANANAS_RESULTS_PATH"), ) try: os.makedirs(output) plot_all(summary_name, summary, reference_solutions) output2 = safe_path_join("tables", dirname=output) os.makedirs(output2) for score in summary.metrics: for split in ["bac", "cv"]: fname = score + '-' + split + ".html" header, columns = summary.results(score, split) arr = np.concatenate( [c.astype(np.str).reshape(-1, 1) for c in columns], axis=1) width = columns_width(arr, header, 30)