示例#1
0
    def precision_recall_curve(values,
                               true_set=None,
                               false_set=None,
                               fdr_thres=0.01,
                               return_curve=False):
        """Score how well ``values`` separate a positive from a negative set.

        Only entries of ``values`` whose index label belongs to either set are
        used. Scores are negated before being handed to the metric functions,
        so lower values are treated as stronger evidence for the positive
        class.

        Args:
            values: pandas object indexed by the labels found in the sets.
            true_set: set of positive labels; defaults to
                ``Utils.get_essential_genes(return_series=False)``.
            false_set: set of negative labels; defaults to
                ``Utils.get_non_essential_genes(return_series=False)``.
            fdr_thres: recall is reported over the curve points whose
                precision is strictly greater than ``1 - fdr_thres``.
            return_curve: when true, the curve arrays are returned as well.

        Returns:
            ``(ap, recall_fdr)`` or, if ``return_curve`` is set,
            ``(ap, recall_fdr, precision, recall, thres)``.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        labelled = positives.union(negatives)

        scores = values[values.index.isin(labelled)]
        labels = scores.index.isin(positives).astype(int)

        # Lower values should rank first, hence the sign flip.
        neg_scores = -scores

        ap = average_precision_score(labels, neg_scores)

        # NOTE(review): this call is expected to resolve to the module-level
        # curve function (presumably scikit-learn's), not to this method.
        precision, recall, thres = precision_recall_curve(labels, neg_scores)

        high_precision = precision > (1 - fdr_thres)
        recall_fdr = recall[high_precision].max()

        if return_curve:
            return ap, recall_fdr, precision, recall, thres

        return ap, recall_fdr
示例#2
0
    def aroc_threshold(values,
                       true_set=None,
                       false_set=None,
                       fpr_thres=0.01,
                       return_curve=False):
        """Compute a (partial) ROC AUC and the score cut-off at a given FPR.

        Only entries of ``values`` whose index label belongs to either set are
        used. Scores are negated before being handed to the metric functions,
        so lower values are treated as stronger evidence for the positive
        class.

        Args:
            values: pandas object indexed by the labels found in the sets.
            true_set: set of positive labels; defaults to
                ``Utils.get_essential_genes(return_series=False)``.
            false_set: set of negative labels; defaults to
                ``Utils.get_non_essential_genes(return_series=False)``.
            fpr_thres: forwarded as ``max_fpr`` to ``roc_auc_score`` and used
                to pick the cut-off; ``None`` disables the cut-off.
            return_curve: when true, ``fpr`` and ``tpr`` are returned as well.

        Returns:
            ``(auc_fpr, fc_thres_fpr)`` or, if ``return_curve`` is set,
            ``(auc_fpr, fc_thres_fpr, fpr, tpr)``. ``fc_thres_fpr`` is ``None``
            when ``fpr_thres`` is ``None``.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        labelled = positives.union(negatives)

        scores = values[values.index.isin(labelled)]
        labels = scores.index.isin(positives).astype(int)

        # Lower values should rank first, hence the sign flip.
        neg_scores = -scores

        fpr, tpr, thres = roc_curve(labels, neg_scores)
        auc_fpr = roc_auc_score(labels, neg_scores, max_fpr=fpr_thres)

        fc_thres_fpr = None
        if fpr_thres is not None:
            # Smallest negated-score threshold still within the FPR budget,
            # flipped back to the sign convention of ``values``.
            fc_thres_fpr = -min(thres[fpr <= fpr_thres])

        if return_curve:
            return auc_fpr, fc_thres_fpr, fpr, tpr

        return auc_fpr, fc_thres_fpr
示例#3
0
def define_sgrnas_sets(clib, fc=None, add_controls=True, dataset_name="Yusa_v1"):
    """Group library sgRNAs into essential, non-essential and control sets.

    Args:
        clib: library DataFrame indexed by sgRNA identifier with a ``"Gene"``
            column.
        fc: optional DataFrame of fold-changes indexed by sgRNA identifier.
            When given, each group also carries the per-sgRNA median across
            columns (sgRNAs whose median is NaN are dropped).
        add_controls: whether to add the ``"nontargeting"`` group.
        dataset_name: selects how control sgRNAs are recognised. For the
            names listed in the function body, controls are index labels
            starting with ``"CTRL0"``; otherwise they are rows whose
            ``"Gene"`` value starts with ``"NO_CURRENT_"``.

    Returns:
        dict keyed by ``"essential"``, ``"nonessential"`` and, if requested,
        ``"nontargeting"``. Each value is a dict with ``color`` (hex string),
        ``sgrnas`` (set of identifiers) and ``fc`` (Series or ``None``).
    """

    def median_fc(sgrnas):
        # fc is optional; every group must tolerate its absence, including
        # the controls when add_controls is left at its default.
        if fc is None:
            return None
        return fc.reindex(sgrnas).median(axis=1).dropna()

    def sgrnas_targeting(genes):
        return set(clib[clib["Gene"].isin(genes)].index)

    sgrna_sets = dict()

    # sgRNA essential
    sgrnas_essential = sgrnas_targeting(
        Utils.get_essential_genes(return_series=False)
    )
    sgrna_sets["essential"] = dict(
        color="#e6550d", sgrnas=sgrnas_essential, fc=median_fc(sgrnas_essential)
    )

    # sgRNA non-essential
    sgrnas_nonessential = sgrnas_targeting(
        Utils.get_non_essential_genes(return_series=False)
    )
    sgrna_sets["nonessential"] = dict(
        color="#3182bd",
        sgrnas=sgrnas_nonessential,
        fc=median_fc(sgrnas_nonessential),
    )

    # sgRNA non-targeting
    if add_controls:
        ctrl_prefix_datasets = (
            "Yusa_v1",
            "Yusa v1",
            "Yusa_v1.1",
            "Yusa v1.1",
            "Sabatini_Lander_AML",
        )

        if dataset_name in ctrl_prefix_datasets:
            sgrnas_control = {i for i in clib.index if i.startswith("CTRL0")}
        else:
            sgrnas_control = set(
                clib[[i.startswith("NO_CURRENT_") for i in clib["Gene"]]].index
            )

        sgrna_sets["nontargeting"] = dict(
            color="#31a354",
            sgrnas=sgrnas_control,
            fc=median_fc(sgrnas_control),
        )

    return sgrna_sets
def project_score_data(sgrnas, subset=None):
    """Return scaled, per-model mean fold-changes for the given sgRNAs.

    The Project Score manifest shipped with the ``crispy`` package is expanded
    into one row per ``experiment_identifier`` entry and used to average the
    fold-changes of the ``"Yusa_v1"`` and ``"Yusa_v1.1"`` datasets by
    ``model_id``. Both matrices are then scaled with the essential and
    non-essential sgRNAs of the ``"Yusa_v1"`` library.

    Args:
        sgrnas: row labels to select from both fold-change matrices.
        subset: optional collection of ``model_id`` values to keep.

    Returns:
        Column-wise concatenation of both matrices restricted to ``sgrnas``,
        with any row containing a missing value dropped.
    """
    ddir = pkg_resources.resource_filename("crispy", "data/")

    score_manifest = pd.read_csv(
        f"{ddir}/crispr_manifests/project_score_manifest.csv.gz")

    # One small frame per manifest row; the comma-separated fields are
    # exploded while model_id is broadcast.
    per_model = [
        pd.DataFrame(
            dict(
                model_id=row["model_id"],
                s_ids=row["library"].split(", "),
                s_lib=row["experiment_identifier"].split(", "),
            ))
        for _, row in score_manifest.iterrows()
    ]
    s_map = pd.concat(per_model).set_index("s_lib")

    if subset is not None:
        keep = s_map["model_id"].isin(subset)
        s_map = s_map[keep]

    def model_mean_fc(dataset):
        # Fold-change against plasmid, then average columns per model.
        fold_changes = dataset.counts.norm_rpm().foldchange(dataset.plasmids)
        return fold_changes.groupby(s_map["model_id"], axis=1).mean()

    score_v1 = CRISPRDataSet("Yusa_v1")
    score_v1_fc = model_mean_fc(score_v1)

    score_v11 = CRISPRDataSet("Yusa_v1.1")
    score_v11_fc = model_mean_fc(score_v11)

    # Reference sgRNA sets are taken from the v1 library for both datasets.
    v1_lib = score_v1.lib
    ess = set(v1_lib[v1_lib["Gene"].isin(Utils.get_essential_genes())].index)
    ness = set(
        v1_lib[v1_lib["Gene"].isin(Utils.get_non_essential_genes())].index)

    score_v1_fc = ReadCounts(score_v1_fc).scale(essential=ess,
                                                non_essential=ness)
    score_v11_fc = ReadCounts(score_v11_fc).scale(essential=ess,
                                                  non_essential=ness)

    selected = [score_v1_fc.loc[sgrnas], score_v11_fc.loc[sgrnas]]
    score_fc = pd.concat(selected, axis=1).dropna()

    return score_fc
示例#5
0
    def pr_curve(rank, true_set=None, false_set=None, min_events=10):
        """Return the ROC AUC of ``rank`` for a positive versus negative set.

        Only entries of ``rank`` whose index label belongs to either set are
        scored. Values are negated before scoring, so lower values are treated
        as stronger evidence for the positive class.

        Args:
            rank: pandas object indexed by the labels found in the sets.
            true_set: set of positive labels; defaults to
                ``Utils.get_essential_genes(return_series=False)``.
            false_set: set of negative labels; defaults to
                ``Utils.get_non_essential_genes(return_series=False)``.
            min_events: minimum number of positive entries required.

        Returns:
            The value of ``roc_auc_score``, or ``np.nan`` when nothing overlaps
            the sets or fewer than ``min_events`` positives are present.
        """
        positives = (Utils.get_essential_genes(return_series=False)
                     if true_set is None else true_set)
        negatives = (Utils.get_non_essential_genes(return_series=False)
                     if false_set is None else false_set)

        labelled = positives.union(negatives)
        scored = rank[rank.index.isin(labelled)]

        # Nothing to evaluate.
        if not len(scored):
            return np.nan

        labels = scored.index.isin(positives).astype(int)

        # Too few positives for a meaningful estimate.
        if labels.sum() < min_events:
            return np.nan

        return roc_auc_score(labels, -scored)
示例#6
0
    def scale(self, essential=None, non_essential=None, metric=np.median):
        """Rescale columns using essential and non-essential reference rows.

        For every column the ``metric`` of the non-essential rows is
        subtracted and the result is divided by the difference between the
        non-essential and essential ``metric`` values.

        Args:
            essential: set of index labels of essential references; defaults
                to ``Utils.get_essential_genes(return_series=False)``.
            non_essential: set of index labels of non-essential references;
                defaults to
                ``Utils.get_non_essential_genes(return_series=False)``.
            metric: callable invoked as ``metric(frame, axis=0)``.

        Returns:
            The rescaled data.

        Raises:
            AssertionError: if either reference set shares no label with the
                index.
        """
        if essential is None:
            essential = Utils.get_essential_genes(return_series=False)
        if non_essential is None:
            non_essential = Utils.get_non_essential_genes(return_series=False)

        n_essential = len(essential.intersection(self.index))
        assert n_essential > 0, \
            "DataFrame has no index overlapping with essential list"

        n_non_essential = len(non_essential.intersection(self.index))
        assert n_non_essential > 0, \
            "DataFrame has no index overlapping with non essential list"

        # Rows with any missing value (including reference labels absent
        # from the index) are excluded from the summaries.
        essential_rows = self.reindex(essential).dropna()
        essential_level = metric(essential_rows, axis=0)

        non_essential_rows = self.reindex(non_essential).dropna()
        non_essential_level = metric(non_essential_rows, axis=0)

        centred = self.subtract(non_essential_level)
        spread = non_essential_level - essential_level

        return centred.divide(spread)
def define_controls(
    n_genes=3,
    cancer_type="Colorectal Carcinoma",
    cn_min=1,
    cn_max=5,
    crisp_min=-0.10,
    jacks_thres=0.25,
    offtarget=None,
):
    """Pick control guides from non-essential genes.

    Steps, as implemented below:

    1. Take the samples of ``cancer_type`` from the sample sheet.
    2. Take the non-essential genes minus those in
       ``Utils.get_sanger_essential()``, request 2 sgRNAs per gene from
       ``gselection.select_sgrnas`` and keep the ``'KosukeYusa'`` library.
    3. Keep sgRNAs whose 25th percentile of fold-change across the selected
       samples is at least ``crisp_min``.
    4. Keep genes that still have exactly 2 sgRNAs, have copy-number and
       location information, and whose copy-number min/max across samples lie
       within ``[cn_min, cn_max]``.
    5. Keep the ``n_genes`` genes with the highest mean of per-sgRNA minimum
       fold-change.

    Args:
        n_genes: number of control genes to keep.
        cancer_type: value matched against the ``cancer_type`` column of the
            sample sheet.
        cn_min: lower bound for the minimum copy number of a gene.
        cn_max: upper bound for the maximum copy number of a gene.
        crisp_min: lower bound for the 25th percentile fold-change of an sgRNA.
        jacks_thres: forwarded to ``gselection.select_sgrnas``.
        offtarget: forwarded to ``gselection.select_sgrnas``; defaults to
            ``[1, 0, 0]``.

    Returns:
        Rows of ``gselection.masterlib`` for the selected guides, with
        ``Confidence`` set to ``"Control"``, restricted to ``LIB_COLUMNS`` and
        sorted by ``Approved_Symbol``.
    """
    # Build the default per call so no list object is shared between calls.
    if offtarget is None:
        offtarget = [1, 0, 0]

    # Samples
    samples = set(DataImporter.Sample().samplesheet.query(
        f"cancer_type == '{cancer_type}'").index)

    # Non-essential genes
    ness = Utils.get_non_essential_genes(return_series=False)
    ness = ness - set(Utils.get_sanger_essential()["Gene"])

    # Non-essential genes sgRNAs
    ness_sgrnas = pd.concat(
        [
            gselection.select_sgrnas(
                g, 2, jacks_thres=jacks_thres,
                offtarget=offtarget).assign(gene=g) for g in ness
        ],
        ignore_index=True,
    ).query("Library == 'KosukeYusa'")

    ness_sgrnas_fc = project_score_data(ness_sgrnas["sgRNA_ID"], samples)

    # Per-sgRNA summary statistics across samples.
    ness_sgrnas_fc_ds = ness_sgrnas_fc.T.describe().T.dropna()
    # Copy the filtered frame so the column added next is written to an
    # independent object rather than to a slice of the unfiltered summary.
    ness_sgrnas_fc_ds = ness_sgrnas_fc_ds[
        ness_sgrnas_fc_ds["25%"] >= crisp_min].copy()
    ness_sgrnas_fc_ds["Approved_Symbol"] = (
        ness_sgrnas.set_index("sgRNA_ID").loc[ness_sgrnas_fc_ds.index,
                                              "Approved_Symbol"].values)

    ness_sgrnas = ness_sgrnas.set_index("sgRNA_ID").loc[
        ness_sgrnas_fc_ds.index]

    # Import different levels of information
    ddir = pkg_resources.resource_filename("crispy", "data/")

    cn = DataImporter.CopyNumber(
        f"{ddir}/copy_number/cnv_abs_copy_number_picnic_20191101.csv.gz"
    ).filter(subset=samples)

    hgnc = pd.read_csv(f"{DPATH}/protein-coding_gene.txt",
                       sep="\t",
                       index_col=1)

    # Control genes: only those for which both requested sgRNAs survived.
    controls = ness_sgrnas.groupby("Approved_Symbol")["Library"].count()
    controls = list(controls[controls == 2].index)
    controls = pd.concat(
        [
            cn.reindex(controls).dropna().T.describe().T,
            hgnc.reindex(controls)["location"],
        ],
        axis=1,
        sort=False,
    ).dropna()
    controls = controls.query(f"(min >= {cn_min}) and (max <= {cn_max})")
    controls = controls.reset_index().rename(
        columns={"index": "Approved_Symbol"})
    # Summary columns present on both sides get the _cn / _crispr suffixes.
    controls = controls.merge(
        ness_sgrnas_fc_ds.reset_index(),
        on="Approved_Symbol",
        suffixes=("_cn", "_crispr"),
    )

    control_genes = list(
        controls.groupby("Approved_Symbol")["min_crispr"].mean().sort_values(
            ascending=False)[:n_genes].index)
    # Copy before assigning the location column to the filtered frame.
    controls = controls[controls["Approved_Symbol"].isin(control_genes)].copy()
    controls["location"] = hgnc.loc[controls["Approved_Symbol"],
                                    "location"].values

    # NOTE(review): relies on the merged frame exposing an "sgRNA" column,
    # i.e. on the index name of the fold-change summary - confirm.
    control_guides = gselection.masterlib[
        gselection.masterlib["sgRNA_ID"].isin(
            controls["sgRNA"])].assign(Confidence="Control")[LIB_COLUMNS]

    return control_guides.sort_values("Approved_Symbol")
    col_colors=pd.Series(sample_pal)[plot_df.columns].rename("Library"),
    row_colors=pd.Series(sample_pal)[plot_df.index].rename("Library"),
    cbar_pos=None,
)

# Write the current figure to the report directory, then close every open
# figure before the next plots are drawn.
plt.savefig(f"{RPATH}/minlibcas9_screens_clustermap_gene_fc.pdf",
            bbox_inches="tight")
plt.close("all")

# Recall gene lists
#
# For each reference gene set, draw the cumulative recall curves of the
# selected samples on a fresh figure and save it under a name containing the
# gene-set label.

gsets_aucs = {}  # NOTE(review): not filled in the visible part of the loop
for n, gset in [
    ("essential", Utils.get_essential_genes()),
    ("non-essential", Utils.get_non_essential_genes()),
]:
    # Aroc: one small, high-resolution figure per gene set
    plt.figure(figsize=(2, 2), dpi=600)
    ax = plt.gca()
    # The second return value is kept as stats_ess; the first is discarded.
    _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples],
                                          gset,
                                          palette=sample_pal,
                                          legend_prop={"size": 4},
                                          ax=ax)
    plt.title(f"{n} recall curve")
    plt.xlabel("Percent-rank of genes")
    plt.ylabel("Cumulative fraction")
    plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both")
    plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf",
                bbox_inches="tight")