def main():
    """Write label-by-batch composition tables to an Excel workbook.

    For every file in ``snakemake.input["data"]`` the label and batch
    annotations (``obs/<label>`` and ``obs/<batch>``, with names taken from
    ``snakemake.config``) are read, cells flagged by ``utils.na_mask`` on the
    label are dropped, and one sheet named ``group_<n>`` (1-based) is written
    to ``snakemake.output[0]``.

    Each sheet has one row per label, the label column first, and one column
    per batch. A cell reads ``"<count> (<percent>%)"``, where the percentage
    is relative to the batch; labels absent from a batch get ``"0 (0.0%)"``.
    """
    label_key = snakemake.config["label"]
    batch_key = snakemake.config["batch"]

    # The context manager saves and closes the workbook on exit, also when an
    # error interrupts the loop. (``ExcelWriter.save()`` no longer exists in
    # pandas >= 2.0.)
    with pd.ExcelWriter(snakemake.output[0]) as output:
        for i, data_file in enumerate(snakemake.input["data"]):
            labels = cb.data.read_hybrid_path("{file}//obs/{label}".format(
                file=data_file, label=label_key))
            batches = cb.data.read_hybrid_path("{file}//obs/{batch}".format(
                file=data_file, batch=batch_key))
            mask = utils.na_mask(labels)
            labels, batches = labels[~mask], batches[~mask]

            df_list = []
            for batch in np.unique(batches):
                uniq, population = np.unique(
                    labels[batches == batch], return_counts=True)
                percent = population / population.sum() * 100
                df_list.append(pd.DataFrame({
                    label_key: uniq,
                    str(batch): [
                        "%s (%.1f%%)" % (count, pct)
                        for count, pct in zip(population, percent)
                    ]
                }))

            # Outer-join the per-batch tables on the label column; a label
            # missing from a batch shows up as NaN and is rendered as zero.
            df = functools.reduce(lambda x, y: pd.merge(
                x, y, how="outer", on=label_key
            ), df_list).fillna("0 (0.0%)")

            sheet_name = "group_%d" % (i + 1)
            df.to_excel(output, sheet_name=sheet_name, index=False)
# Example #2
# 0
def main():
    """Compute integration metrics for one result and dump them as JSON.

    Reads the label and batch annotations from ``snakemake.input.data``,
    drops the cells flagged by ``utils.na_mask`` on the label, integer-encodes
    both, and reads the embedding stored under ``snakemake.params.slot`` in
    ``snakemake.input.result``. The metrics, the recorded run time and the
    number of cells are written to ``snakemake.output[0]``.
    """
    def load(source, key):
        # Hybrid paths have the form "<file>//<path inside the file>".
        return cb.data.read_hybrid_path("//".join([source, key]))

    labels = load(snakemake.input.data, "obs/%s" % snakemake.config["label"])
    na = utils.na_mask(labels)
    labels = cb.utils.encode_integer(labels[~na])[0]

    # The batch annotation is filtered with the mask derived from the labels.
    batches = load(snakemake.input.data, "obs/%s" % snakemake.config["batch"])
    batches = cb.utils.encode_integer(batches[~na])[0]

    latent = load(snakemake.input.result, snakemake.params.slot)

    nn_accuracy = cb.metrics.nearest_neighbor_accuracy(latent, labels)
    mean_ap = cb.metrics.mean_average_precision_from_latent(
        latent, labels, k=snakemake.config["nn"])
    alignment = cb.metrics.seurat_alignment_score(
        latent, batches, n=10, k=snakemake.config["nn"])
    mixing_entropy = cb.metrics.batch_mixing_entropy(latent, batches)
    # "Null" have time = 0 read as np.int64, hence the float() conversion.
    elapsed = float(load(snakemake.input.result, "time"))

    performance = {
        "nearest_neighbor_accuracy": nn_accuracy,
        "mean_average_precision": mean_ap,
        "seurat_alignment_score": alignment,
        "batch_mixing_entropy": mixing_entropy,
        "time": elapsed,
        "n_cell": latent.shape[0],
    }

    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)
def main():
    """Score a latent embedding against cell labels and dump the result as JSON.

    Reads the label annotation (``obs/<label>``, name taken from
    ``snakemake.config``) from ``snakemake.input.data``, drops entries flagged
    by ``utils.na_mask`` and integer-encodes the rest. The embedding is read
    from the ``latent`` slot of ``snakemake.input.result``. Nearest neighbor
    accuracy, mean average precision, the recorded run time and the number of
    cells are written to ``snakemake.output[0]``.
    """
    y = cb.data.read_hybrid_path("//".join(
        [snakemake.input.data,
         "obs/%s" % snakemake.config["label"]]))
    y = y[~utils.na_mask(y)]
    y = cb.utils.encode_integer(y)[0]

    # NOTE(review): x is not filtered by the NA mask applied to y; presumably
    # the result was computed on labeled cells only. Confirm with the producer.
    x = cb.data.read_hybrid_path("//".join([snakemake.input.result, "latent"]))
    performance = dict(
        nearest_neighbor_accuracy=cb.metrics.nearest_neighbor_accuracy(x, y),
        mean_average_precision=cb.metrics.mean_average_precision_from_latent(
            x, y, k=snakemake.config["nn"]),
        # The stored time can come back as a NumPy integer (e.g. a time of 0
        # read as np.int64), which json.dump cannot serialize. Coerce it to a
        # plain float.
        time=float(
            cb.data.read_hybrid_path("//".join(
                [snakemake.input.result, "time"]))),
        n_cell=x.shape[0])

    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)
# Example #4
# 0
def main():
    """Evaluate rejection-aware cell type predictions at every threshold.

    "Positive" cell types are the labels present in the reference datasets
    (``snakemake.input.ref``). True labels come from ``snakemake.input.true``
    and predictions from the ``prediction`` group of each HDF5 file in
    ``snakemake.input.pred``, which holds one dataset per threshold.
    Predictions equal to ``"unassigned"``, ``"ambiguous"`` or ``"rejected"``
    count as negative calls.

    For each threshold, in ascending order, the function computes
    dataset-weighted sensitivity and specificity and the mean class-specific
    accuracy over positive types, negative types and their average. The
    summaries are written as a JSON list to ``snakemake.output[0]``. The
    per-class accuracy tables go to ``snakemake.output[1]``, one sheet per
    threshold.
    """
    label_key = snakemake.config["label"]

    def read_labels(path):
        # Label annotation of one dataset, addressed as "<file>//obs/<label>".
        return cb.data.read_hybrid_path("{file}//obs/{label}".format(
            file=path, label=label_key))

    ref = np.concatenate([read_labels(item) for item in snakemake.input.ref])
    ref = ref[~utils.na_mask(ref)]
    pos_types = np.unique(ref)

    expect = pd.read_csv(snakemake.params.expect, index_col=0)

    # Dataset weighed: every cell of a dataset gets weight 1 / (dataset size),
    # so each query dataset contributes equally. The weights are then rescaled
    # to average 1.
    true = [read_labels(item) for item in snakemake.input.true]
    true = [item[~utils.na_mask(item)] for item in true]
    weight = np.concatenate(
        [np.repeat(1 / item.size, item.size) for item in true])
    weight /= weight.sum() / weight.size
    true = np.concatenate(true)
    tp = np.isin(true, pos_types)  # cells whose true type is in the reference
    tn = ~tp

    # Collect the predictions of all files per threshold, in file order.
    pred_dict = collections.defaultdict(list)
    for item in snakemake.input.pred:
        with h5py.File(item, "r") as f:
            g = f["prediction"]
            for threshold in g:
                pred_dict[float(threshold)].append(
                    cb.data.read_clean(g[threshold][...]))

    negative_calls = ["unassigned", "ambiguous", "rejected"]
    performance = []
    # The context manager saves and closes the workbook once, after all sheets
    # are written, and also when the loop raises. (``ExcelWriter.save()`` no
    # longer exists in pandas >= 2.0.)
    with pd.ExcelWriter(snakemake.output[1]) as cell_type_specific_excel:
        for threshold in sorted(pred_dict):
            pred = np.concatenate(pred_dict[threshold])
            assert len(pred) == len(true)
            pn = np.isin(pred, negative_calls)  # predicted negative
            pp = ~pn
            sensitivity = (weight * np.logical_and(tp, pp)).sum() / \
                (weight * tp).sum()
            specificity = (weight * np.logical_and(tn, pn)).sum() / \
                (weight * tn).sum()
            class_specific_accuracy = cb.metrics.class_specific_accuracy(
                true, pred, expect)
            class_specific_accuracy.insert(
                0, "positive",
                np.isin(class_specific_accuracy.index, pos_types))
            pos_mba = class_specific_accuracy.loc[
                class_specific_accuracy["positive"], "accuracy"].mean()
            neg_mba = class_specific_accuracy.loc[
                ~class_specific_accuracy["positive"], "accuracy"].mean()
            mba = (pos_mba + neg_mba) / 2
            performance.append(
                dict(ref_size=ref.size,
                     threshold=threshold,
                     sensitivity=sensitivity,
                     specificity=specificity,
                     pos_mba=pos_mba,
                     neg_mba=neg_mba,
                     mba=mba))
            class_specific_accuracy.to_excel(cell_type_specific_excel,
                                             sheet_name=str(threshold),
                                             index_label=label_key)

    with open(snakemake.output[0], "w") as f:
        json.dump(performance, f, indent=4)