Example #1
    def __init__(self,
                 uid,
                 max_request=100,
                 client_wait_time=10,
                 server_wait_time=1):

        # Per-user working directories, all resolved under FINGERPRINTS_PATH
        # (the backup and lock paths derive their names from uid).
        self.data_dirname = safe_path_join("data", uid,
                                           dirname=FINGERPRINTS_PATH)
        self.backup_dirname = safe_path_join("data", uid + ".backup",
                                             dirname=FINGERPRINTS_PATH)
        self.cache_dirname = safe_path_join("cache", uid,
                                            dirname=FINGERPRINTS_PATH)
        self.pending_dirname = safe_path_join("pending", uid,
                                              dirname=FINGERPRINTS_PATH)
        self.lock_dirname = safe_path_join("cache", uid + ".lock",
                                           dirname=FINGERPRINTS_PATH)
        self.uid = uid
        self.max_request = max_request
        self.client_wait_time = client_wait_time
        self.server_wait_time = server_wait_time
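
All of these snippets route filesystem paths through safe_path_join, which is project-specific code. As a rough mental model (an assumption, not the project's actual implementation), it joins the given components under a base directory and refuses any result that would escape it:

import os


def safe_path_join(*parts, dirname):
    # Hypothetical stand-in for the project's helper: join parts under
    # dirname, then reject paths that resolve outside the base directory.
    path = os.path.abspath(os.path.join(dirname, *parts))
    base = os.path.abspath(dirname)
    if os.path.commonpath([base, path]) != base:
        raise ValueError("path escapes {}: {}".format(dirname, path))
    return path
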
Example #2
    def add_pending_hx(self, hx):
        if not os.path.exists(self.pending_dirname):
            os.makedirs(self.pending_dirname)
        # Per-process paths: the pid keeps concurrent workers apart.
        dirname = safe_path_join(str(os.getpid()), dirname=self.cache_dirname)
        lock_dirname = safe_path_join(str(os.getpid()),
                                      dirname=self.pending_dirname)
        assert not os.path.exists(dirname)
        assert not os.path.exists(lock_dirname)
        dd = DataDict()
        dd["hx"] = Variable(hx)
        dd._save(lock_dirname, self.uid)
        return dirname, lock_dirname
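
The pid-named lock directory relies on the directory-as-lock idiom: directory creation is atomic, so at most one process can create a given path. A minimal sketch of that idiom (not this project's API):

import os


def try_acquire(lock_dirname):
    # os.mkdir either creates the directory or raises FileExistsError,
    # atomically, so the process that succeeds holds the lock.
    try:
        os.mkdir(lock_dirname)
        return True
    except FileExistsError:
        return False


def release(lock_dirname):
    os.rmdir(lock_dirname)
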
Example #3
    def get_all_pending_hx(self):
        if not os.path.exists(self.pending_dirname) or not os.listdir(
                self.pending_dirname):
            # np.str was removed in NumPy 1.24; plain str is equivalent here.
            return self.hash_x(np.array([], dtype=str))
        result = []
        for dirname in os.listdir(self.pending_dirname):
            result.append(
                DataDict(source_dirname=safe_path_join(
                    dirname, dirname=self.pending_dirname))["hx"])
        return np.unique(np.concatenate(result))
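
The final line merges the per-process hash arrays and removes duplicates in one step; a tiny self-contained demonstration of that NumPy pattern:

import numpy as np

parts = [np.array(["a3f", "b42"]), np.array(["b42", "c01"])]
print(np.unique(np.concatenate(parts)))  # ['a3f' 'b42' 'c01'] -- sorted, no duplicates
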
Example #4
def export(target_uid, threshold, include_earliest_year,
           ic50_conversion_strategy, fit_ic50):
    mandalka.config(fake_del_arguments=True)  # necessary to describe nodes
    dirname = ("target:{}--threshold:{}--year:{}--which:{}"
               "--fitIC50:{}".format(
                   target_uid,
                   threshold,
                   include_earliest_year,
                   ic50_conversion_strategy,
                   fit_ic50,
               ))
    output = os.path.join(
        os.getenv("ANANAS_RESULTS_PATH"),
        "elderberries-Benchmarks2018",
        "data_pipelines",
        dirname,
    )
    if not os.path.exists(output):
        os.makedirs(output)
    elif not os.path.isdir(output):
        raise OSError("remove {}".format(output))
    # A valid target uid is "CHEMBL" followed by digits.
    if target_uid.startswith("CHEMBL") and target_uid[6:].isnumeric():
        print("exporting target {}...".format(target_uid))
        pipeline = mean_warszycki_logki(
            target_uid=target_uid,
            chembl_filename="chembl_24.db",
            threshold=threshold,
            include_earliest_year=include_earliest_year,
            ic50_conversion_strategy=ic50_conversion_strategy,
            fit_ic50=fit_ic50,
        )
        dirname = safe_path_join(target_uid, dirname=output)
        if os.path.exists(dirname):
            print("{} already exists, skipping...".format(dirname))
        else:
            dump_nodes_to_html(
                pipeline["data_nodes"],
                dirname,
            )
    else:
        print("{} is not a valid target, skipping...".format(target_uid))
Example #5
def dump_nodes_to_html(nodes, dirname):
    def describe(node, depth):
        return "{}: {}".format(mandalka.unique_id(node),
                               mandalka.describe(node, depth))

    def href_chembl_compound(uid):
        return href(
            "https://www.ebi.ac.uk/chembl/compound/inspect/{}".format(uid),
            uid,
        )

    def href_chembl_document(uid):
        return href(
            "https://www.ebi.ac.uk/chembl/doc/inspect/{}".format(uid),
            uid,
        )

    def dump_node_to_html(node, fname):
        arr, header = to_arr_header(node.data)
        width = columns_width(arr, header, 30)
        arr, header = sanitize_html(arr), sanitize_html(header)
        for i, key in enumerate(header):
            # Compound uids link to the ChEMBL compound page,
            # document uids to the document page.
            if "uid" in key and "doc" not in key:
                arr[:, i] = np.vectorize(href_chembl_compound,
                                         otypes=(str,))(arr[:, i])
            if "uid" in key and "doc" in key:
                arr[:, i] = np.vectorize(href_chembl_document,
                                         otypes=(str,))(arr[:, i])
        with open(fname, 'x') as f_out:
            f_out.write(
                doc_template(
                    style_template(
                        table_style_1("data_table"),
                        div_style_1(None),
                    ) + '\n' + tablesorter(),
                    to_html(arr, header, width, "data_table"),
                ))

    os.makedirs(dirname)
    os.makedirs(safe_path_join("per_uid", dirname=dirname))
    os.makedirs(safe_path_join("per_node", dirname=dirname))
    body = []
    for node in nodes:
        nname = sanitize_html(describe(node, depth=1))
        fname = "{}.html".format(node)
        dump_node_to_html(node,
                          safe_path_join("per_node", fname, dirname=dirname))
        body.append(div_template(href(fname, nname)))
        body.append('<pre style="white-space: pre-wrap">' +
                    sanitize_html(str(node.data)) + '</pre>')
    with open(safe_path_join("per_node", "index.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                style_template(div_style_1(None), ),
                '\n'.join(body),
            ))

    href_uid = lambda uid: href("{}.html".format(uid), uid)
    all_uids = sorted({uid for n in nodes for uid in n.data["uid"]})
    with open(safe_path_join("per_uid", "index.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                "",
                '\t'.join([href_uid(uid) for uid in all_uids]),
            ))

    body = []
    for i in range(len(nodes)):
        u1 = set(nodes[i - 1].data["uid"]) if i > 0 else set()
        u2 = set(nodes[i].data["uid"])
        new_uids = '\t'.join([href_uid(uid) for uid in sorted(u2 - u1)])
        deleted_uids = '\t'.join([href_uid(uid) for uid in sorted(u1 - u2)])
        body += [
            div_template("&#8680;\t" +
                         sanitize_html(describe(nodes[i], depth=1))),
            div_template("NEW UIDS"),
            div_template(new_uids),
            div_template("DELETED UIDS"),
            div_template(deleted_uids),
        ]
    with open(safe_path_join("per_uid", "deltas.html", dirname=dirname),
              'x') as f_out:
        f_out.write(
            doc_template(
                style_template(div_style_1()),
                '\n'.join(body),
            ))

    for iuid, uid in enumerate(tqdm.tqdm(all_uids)):
        body = []
        body.append(div_template(href_chembl_compound(uid)))
        for n in nodes:
            body.append(div_template(sanitize_html(describe(n, depth=1))))
            data = n.data.slice[n.data["uid"] == uid]
            arr, header = to_arr_header(data)
            width = columns_width(arr, header, 30)
            arr, header = sanitize_html(arr), sanitize_html(header)
            for i, key in enumerate(header):
                if "uid" in key:
                    arr[:, i] = np.vectorize(href_uid,
                                             otypes=(str,))(arr[:, i])
            body.append(to_html(arr, header, width, "data_table"))
        body.append(
            div_template(' '.join([
                href_uid(all_uids[iuid - 1]),
                href("index.html", "INDEX"),
                href_uid(all_uids[(iuid + 1) % len(all_uids)]),
            ])))
        with open(safe_path_join("per_uid", uid + ".html", dirname=dirname),
                  'x') as f_out:
            f_out.write(
                doc_template(
                    style_template(
                        table_style_1("data_table"),
                        div_style_1(),
                    ) + '\n' + tablesorter(),
                    '\n'.join(body),
                ))
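
sanitize_html is applied both to whole NumPy arrays and to single strings above. A plausible stand-in (an assumption about its behavior, not the project's code) escapes HTML metacharacters element-wise:

import html

import numpy as np


def sanitize_html(value):
    # Hypothetical helper: escape <, >, & and quotes so uids and node
    # descriptions render as text; vectorized over ndarray inputs.
    if isinstance(value, np.ndarray):
        return np.vectorize(html.escape, otypes=(str,))(value)
    return html.escape(str(value))
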
Example #6
def plot_all(summary_name, summary, reference_solutions):

    output_dir = safe_path_join(
        "elderberries-Benchmarks2018",
        "scores",
        summary_name,
        "figures",
        dirname=os.getenv("ANANAS_RESULTS_PATH"),
    )
    os.makedirs(output_dir)

    metric_pairs = [
        (metric, metric, reversed_metric)
        for metric, (_, reversed_metric) in summary.metrics.items()
    ]
    for metric in summary.closer_further_metrics:
        metric_pairs.append(
            (metric, metric + "_Closer", summary.metrics[metric][1]))
        metric_pairs.append(
            (metric, metric + "_Further", summary.metrics[metric][1]))

    for cv_metric, bac_metric, reversed_metric in metric_pairs:

        metric_plot_name = "CV_{}_BAC_{}".format(cv_metric, bac_metric)
        header, cv = summary.results(cv_metric, "cv")
        header, bac = summary.results(bac_metric, "bac")
        assert np.all(cv[0] == bac[0])
        solutions = cv[0]

        _ref_mask = np.vectorize(lambda x: x in reference_solutions)(solutions)
        _cv = np.concatenate(
            [cv[i].reshape(-1, 1) for i in range(1, len(header) - 1)], axis=1)
        _bac = np.concatenate(
            [bac[i].reshape(-1, 1) for i in range(1, len(header) - 1)], axis=1)

        # Scores are negated for non-reversed metrics so that "larger is
        # better" holds while the zoomed axis limits are computed; the
        # limits are flipped back afterwards.
        if not reversed_metric:
            _cv, _bac = -_cv, -_bac
        xlim = np.array(
            (_cv.min(), min(_cv[_ref_mask].max() * 2 - _cv.min(), _cv.max())))
        ylim = np.array(
            (_bac.min(), min(_bac[_ref_mask].max() * 2 - _bac.min(),
                             _bac.max())))
        if not reversed_metric:
            xlim, ylim = -xlim[::-1], -ylim[::-1]
        # Pad both ranges by 5% on each side.
        xlim += .05 * np.array((xlim[0] - xlim[1], xlim[1] - xlim[0]))
        ylim += .05 * np.array((ylim[0] - ylim[1], ylim[1] - ylim[0]))
        xlabel = "cross validation score"
        ylabel = "balanced agglomerative clustering score"
        for i in range(1, 13):
            if i == len(header) - 1:  # ranking
                xlim = ylim = (-3, len(solutions) + 3)
                reversed_metric = True
            # Four variants per column; the filename suffix encodes the
            # best_only / color_flat / legend combination.
            for suffix, best_only, color_flat, legend in [
                    ("", False, False, True),
                    ("_", False, True, True),
                    ("__", True, False, False),
                    ("___", True, True, False),
            ]:
                plot(safe_path_join(metric_plot_name + '_' + header[i] +
                                    suffix + ".svg",
                                    dirname=output_dir),
                     solutions,
                     cv[i],
                     bac[i],
                     title=header[i],
                     reference_solutions=reference_solutions,
                     xlabel=xlabel,
                     ylabel=ylabel,
                     xlim=xlim,
                     ylim=ylim,
                     best_only=best_only,
                     color_flat=color_flat,
                     legend=legend,
                     reversed_metric=reversed_metric)
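
The axis-limit rule above zooms each axis so the upper bound sits at twice the best reference score's distance from the minimum, clipped to the data maximum; outliers far beyond the references are cut off rather than stretching the plot. A worked example with made-up numbers:

import numpy as np

scores = np.array([0.10, 0.40, 0.55, 3.00])  # one extreme outlier
ref_best = 0.55                              # best reference solution
lo = scores.min()                            # 0.10
hi = min(ref_best * 2 - lo, scores.max())    # min(1.00, 3.00) -> 1.00
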
Example #7


if __name__ == "__main__":

    for summary_name, summary, reference_solutions in SUMMARIES:

        output = safe_path_join(
            "elderberries-Benchmarks2018",
            "scores",
            summary_name,
            dirname=os.getenv("ANANAS_RESULTS_PATH"),
        )
        try:
            os.makedirs(output)
            plot_all(summary_name, summary, reference_solutions)
            output2 = safe_path_join("tables", dirname=output)
            os.makedirs(output2)
            for score in summary.metrics:
                for split in ["bac", "cv"]:
                    fname = score + '-' + split + ".html"
                    header, columns = summary.results(score, split)
                    arr = np.concatenate(
                        [c.astype(str).reshape(-1, 1) for c in columns],
                        axis=1)
                    width = columns_width(arr, header, 30)