Пример #1
0
    def recall_curve(rank, index_set=None, min_events=None):
        """
        Compute the coordinates and the area of a cumulative recall curve.

        The values are sorted in ascending order (missing values discarded)
        and, walking down that sorted list, the cumulative fraction of
        ``index_set`` members seen so far is recorded against the rank of
        each value divided by the number of values.

        :param rank: pandas.Series
            Values indexed by the identifiers matched against ``index_set``.

        :param index_set: collection or None, optional
            Identifiers counted as hits. When None, the result of
            ``Utils.get_essential_genes(return_series=False)`` is used.

        :param min_events: int or None, optional
            Minimum number of hits that must be present in ``rank``; with
            fewer hits no curve is calculated.

        :return: None when fewer than ``min_events`` hits are present,
            otherwise the tuple ``(x, y, xy_auc)``: normalised ranks,
            cumulative hit fraction and the value of ``auc(x, y)``.
        """
        ordered = rank.sort_values().dropna()

        if index_set is None:
            index_set = Utils.get_essential_genes(return_series=False)

        # Boolean mask, in sorted order, flagging the entries that are hits
        hits = ordered.index.isin(index_set)
        n_hits = hits.sum()

        if min_events is not None and n_hits < min_events:
            return None

        # Observed cumulative fraction of hits recovered
        cum_fraction = np.cumsum(hits) / n_hits

        # Ranks scaled by the number of non-missing values
        scaled_rank = st.rankdata(ordered) / len(ordered)

        return scaled_rank, cum_fraction, auc(scaled_rank, cum_fraction)
Пример #2
0
    def aroc_threshold(values,
                       true_set=None,
                       false_set=None,
                       fpr_thres=0.01,
                       return_curve=False):
        """
        Score how well ``values`` separate ``true_set`` from ``false_set``
        and derive the value threshold at a given false-positive rate.

        Only entries whose index is in either set are used. The values are
        negated before being passed to ``roc_curve`` / ``roc_auc_score``, so
        lower values are treated as stronger evidence for the true class.

        :param values: pandas.Series
            Values indexed by the identifiers found in the two sets.

        :param true_set: set or None, optional
            Positive identifiers. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative identifiers. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param fpr_thres: float or None, optional
            Passed as ``max_fpr`` to ``roc_auc_score`` and used to pick the
            threshold; when None no threshold is derived.

        :param return_curve: bool, optional
            Also return the ``fpr`` and ``tpr`` arrays of the ROC curve.

        :return: ``(auc_fpr, fc_thres_fpr)`` or, with ``return_curve``,
            ``(auc_fpr, fc_thres_fpr, fpr, tpr)``. ``fc_thres_fpr`` is None
            when ``fpr_thres`` is None.
        """
        if true_set is None:
            true_set = Utils.get_essential_genes(return_series=False)

        if false_set is None:
            false_set = Utils.get_non_essential_genes(return_series=False)

        # Restrict to labelled entries and build the 0/1 class vector
        labelled = values.index.isin(true_set.union(false_set))
        rank = values[labelled]
        y_true = rank.index.isin(true_set).astype(int)
        scores = -rank

        fpr, tpr, thres = roc_curve(y_true, scores)
        auc_fpr = roc_auc_score(y_true, scores, max_fpr=fpr_thres)

        fc_thres_fpr = None
        if fpr_thres is not None:
            # Smallest score threshold still within the allowed FPR, mapped
            # back to the original (non-negated) scale
            fc_thres_fpr = -min(thres[fpr <= fpr_thres])

        if return_curve:
            return auc_fpr, fc_thres_fpr, fpr, tpr

        return auc_fpr, fc_thres_fpr
Пример #3
0
    def precision_recall_curve(values,
                               true_set=None,
                               false_set=None,
                               fdr_thres=0.01,
                               return_curve=False):
        """
        Compute the average precision of ``values`` for separating
        ``true_set`` from ``false_set`` and the recall reached at a given
        false-discovery rate.

        Only entries whose index is in either set are used, and the values
        are negated before scoring so that lower values count as stronger
        evidence for the true class.

        :param values: pandas.Series
            Values indexed by the identifiers found in the two sets.

        :param true_set: set or None, optional
            Positive identifiers. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative identifiers. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param fdr_thres: float, optional
            Curve points are kept when their precision is strictly greater
            than ``1 - fdr_thres``.

        :param return_curve: bool, optional
            Also return the precision, recall and threshold arrays.

        :return: ``(ap, recall_fdr)`` or, with ``return_curve``,
            ``(ap, recall_fdr, precision, recall, thres)``, where
            ``recall_fdr`` is the largest recall among the kept points.
        """
        if true_set is None:
            true_set = Utils.get_essential_genes(return_series=False)

        if false_set is None:
            false_set = Utils.get_non_essential_genes(return_series=False)

        # Restrict to labelled entries and build the 0/1 class vector
        labelled = values.index.isin(true_set.union(false_set))
        rank = values[labelled]
        y_true = rank.index.isin(true_set).astype(int)
        scores = -rank

        ap = average_precision_score(y_true, scores)

        # NOTE(review): this name matches the enclosing function; presumably
        # it resolves to a module-level import because this definition lives
        # inside a class namespace - confirm.
        precision, recall, thres = precision_recall_curve(y_true, scores)

        min_precision = 1 - fdr_thres
        recall_fdr = recall[precision > min_precision].max()

        if return_curve:
            return ap, recall_fdr, precision, recall, thres

        return ap, recall_fdr
Пример #4
0
def define_sgrnas_sets(clib, fc=None, add_controls=True, dataset_name="Yusa_v1"):
    """
    Group the sgRNAs of a library into essential, non-essential and
    (optionally) non-targeting control sets.

    :param clib: pandas.DataFrame
        Library table indexed by sgRNA identifier with a "Gene" column.

    :param fc: pandas.DataFrame or None, optional
        Fold-changes indexed by sgRNA identifier. When given, the per-sgRNA
        median across columns is stored for each group (sgRNAs without a
        value are dropped); when None every group's ``fc`` entry is None.

    :param add_controls: bool, optional
        Whether to add the "nontargeting" group.

    :param dataset_name: str, optional
        Selects how control sgRNAs are recognised: for the listed Yusa /
        Sabatini names by an sgRNA identifier starting with "CTRL0",
        otherwise by a "Gene" value starting with "NO_CURRENT_".

    :return: dict
        Maps "essential", "nonessential" and, if requested, "nontargeting"
        to a dict with keys ``color``, ``sgrnas`` (set of identifiers) and
        ``fc`` (pandas.Series or None).
    """
    ctrl_id_datasets = (
        "Yusa_v1",
        "Yusa v1",
        "Yusa_v1.1",
        "Yusa v1.1",
        "Sabatini_Lander_AML",
    )

    def median_fc(sgrnas):
        # Fold-changes are optional: without them there is nothing to
        # summarise. Previously the control group skipped this check and
        # crashed when fc was None.
        if fc is None:
            return None
        return fc.reindex(list(sgrnas)).median(axis=1).dropna()

    sgrna_sets = dict()

    # sgRNA essential
    essential_genes = Utils.get_essential_genes(return_series=False)
    sgrnas_essential = set(clib[clib["Gene"].isin(essential_genes)].index)

    sgrna_sets["essential"] = dict(
        color="#e6550d", sgrnas=sgrnas_essential, fc=median_fc(sgrnas_essential)
    )

    # sgRNA non-essential
    nonessential_genes = Utils.get_non_essential_genes(return_series=False)
    sgrnas_nonessential = set(clib[clib["Gene"].isin(nonessential_genes)].index)

    sgrna_sets["nonessential"] = dict(
        color="#3182bd",
        sgrnas=sgrnas_nonessential,
        fc=median_fc(sgrnas_nonessential),
    )

    # sgRNA non-targeting
    if add_controls:
        if dataset_name in ctrl_id_datasets:
            sgrnas_control = {i for i in clib.index if i.startswith("CTRL0")}
        else:
            sgrnas_control = set(
                clib[[i.startswith("NO_CURRENT_") for i in clib["Gene"]]].index
            )

        sgrna_sets["nontargeting"] = dict(
            color="#31a354",
            sgrnas=sgrnas_control,
            fc=median_fc(sgrnas_control),
        )

    return sgrna_sets
Пример #5
0
def project_score_data(sgrnas, subset=None):
    """
    Assemble scaled, per-model fold-changes for ``sgrnas`` from the
    "Yusa_v1" and "Yusa_v1.1" CRISPR data sets.

    The packaged Project Score manifest is expanded to one row per entry of
    its comma-separated "library" / "experiment_identifier" columns. The
    resulting ``model_id`` mapping is used to average each data set's
    fold-changes by model. Both matrices are then scaled with the essential
    and non-essential sgRNAs of the "Yusa_v1" library.

    :param sgrnas: row labels selected (via ``.loc``) from both matrices.

    :param subset: collection or None, optional
        When given, only manifest rows whose ``model_id`` is in it are kept.

    :return: pandas.DataFrame
        Column-wise concatenation of the two scaled matrices for ``sgrnas``,
        with rows containing missing values dropped.
    """
    ddir = pkg_resources.resource_filename("crispy", "data/")

    score_manifest = pd.read_csv(
        f"{ddir}/crispr_manifests/project_score_manifest.csv.gz"
    )

    def expand(row):
        # One output row per comma-separated entry; model_id is broadcast.
        # NOTE(review): s_ids comes from "library" and s_lib from
        # "experiment_identifier" - the names look swapped, confirm intent.
        return pd.DataFrame(
            dict(
                model_id=row["model_id"],
                s_ids=row["library"].split(", "),
                s_lib=row["experiment_identifier"].split(", "),
            )
        )

    s_map = pd.concat(
        [expand(score_manifest.iloc[i]) for i in score_manifest.index]
    ).set_index("s_lib")

    if subset is not None:
        s_map = s_map[s_map["model_id"].isin(subset)]

    def model_mean_fc(dataset):
        # Normalise counts, compute fold-changes against the plasmids and
        # average the columns that map to the same model
        fold_changes = dataset.counts.norm_rpm().foldchange(dataset.plasmids)
        return fold_changes.groupby(s_map["model_id"], axis=1).mean()

    score_v1 = CRISPRDataSet("Yusa_v1")
    score_v1_fc = model_mean_fc(score_v1)

    score_v11 = CRISPRDataSet("Yusa_v1.1")
    score_v11_fc = model_mean_fc(score_v11)

    # Reference sgRNA sets are taken from the v1 library for both data sets
    v1_genes = score_v1.lib["Gene"]
    ess = set(score_v1.lib[v1_genes.isin(Utils.get_essential_genes())].index)
    ness = set(score_v1.lib[v1_genes.isin(Utils.get_non_essential_genes())].index)

    score_v1_fc = ReadCounts(score_v1_fc).scale(essential=ess, non_essential=ness)
    score_v11_fc = ReadCounts(score_v11_fc).scale(essential=ess, non_essential=ness)

    per_dataset = [score_v1_fc.loc[sgrnas], score_v11_fc.loc[sgrnas]]
    score_fc = pd.concat(per_dataset, axis=1).dropna()

    return score_fc
Пример #6
0
    def pr_curve(rank, true_set=None, false_set=None, min_events=10):
        """
        Return ``roc_auc_score`` of the negated ``rank`` values for telling
        ``true_set`` apart from ``false_set``.

        Only entries whose index is in either set are scored.

        :param rank: pandas.Series
            Values indexed by the identifiers found in the two sets.

        :param true_set: set or None, optional
            Positive identifiers. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param false_set: set or None, optional
            Negative identifiers. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param min_events: int, optional
            Minimum number of positive entries required for scoring.

        :return: the score, or ``numpy.nan`` when no entry overlaps the two
            sets or fewer than ``min_events`` positives are present.
        """
        if true_set is None:
            true_set = Utils.get_essential_genes(return_series=False)

        if false_set is None:
            false_set = Utils.get_non_essential_genes(return_series=False)

        labelled = rank.index.isin(true_set.union(false_set))
        rank = rank[labelled]

        # Nothing left to score
        if not len(rank):
            return np.nan

        y_true = rank.index.isin(true_set).astype(int)

        # Too few positives to score
        if y_true.sum() < min_events:
            return np.nan

        return roc_auc_score(y_true, -rank)
Пример #7
0
    def scale(self, essential=None, non_essential=None, metric=np.median):
        """
        Rescale the values column by column using two reference row sets.

        For every column, ``metric`` is evaluated over the rows in
        ``essential`` and over the rows in ``non_essential`` (rows with
        missing values are dropped first). The result is
        ``(self - non_essential_metric) / (non_essential_metric -
        essential_metric)``.

        :param essential: set or None, optional
            Row labels of the essential reference. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.

        :param non_essential: set or None, optional
            Row labels of the non-essential reference. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.

        :param metric: callable, optional
            Called as ``metric(frame, axis=0)`` to summarise each column.

        :return: the rescaled data, as returned by ``subtract`` / ``divide``.

        :raises AssertionError: when either reference set shares no label
            with ``self.index``.
        """
        if essential is None:
            essential = Utils.get_essential_genes(return_series=False)

        if non_essential is None:
            non_essential = Utils.get_non_essential_genes(return_series=False)

        n_essential = len(essential.intersection(self.index))
        assert (
            n_essential != 0
        ), "DataFrame has no index overlapping with essential list"

        n_non_essential = len(non_essential.intersection(self.index))
        assert (
            n_non_essential != 0
        ), "DataFrame has no index overlapping with non essential list"

        # Per-column summary of each reference group
        essential_rows = self.reindex(essential).dropna()
        essential_metric = metric(essential_rows, axis=0)

        non_essential_rows = self.reindex(non_essential).dropna()
        non_essential_metric = metric(non_essential_rows, axis=0)

        centred = self.subtract(non_essential_metric)
        spread = non_essential_metric - essential_metric

        return centred.divide(spread)
    lw=0.05,
    col_colors=pd.Series(sample_pal)[plot_df.columns].rename("Library"),
    row_colors=pd.Series(sample_pal)[plot_df.index].rename("Library"),
    cbar_pos=None,
)

# Write the current figure to a PDF under RPATH, then close every open
# figure (plt is presumably matplotlib.pyplot - confirm against the imports).
plt.savefig(f"{RPATH}/minlibcas9_screens_clustermap_gene_fc.pdf",
            bbox_inches="tight")
plt.close("all")

# Recall gene lists
#

gsets_aucs = {}
for n, gset in [
    ("essential", Utils.get_essential_genes()),
    ("non-essential", Utils.get_non_essential_genes()),
]:
    # Aroc
    plt.figure(figsize=(2, 2), dpi=600)
    ax = plt.gca()
    _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples],
                                          gset,
                                          palette=sample_pal,
                                          legend_prop={"size": 4},
                                          ax=ax)
    plt.title(f"{n} recall curve")
    plt.xlabel("Percent-rank of genes")
    plt.ylabel("Cumulative fraction")
    plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both")
    plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf",