def precision_recall_curve(values, true_set=None, false_set=None, fdr_thres=0.01, return_curve=False):
    """Score how well ``values`` separates a positive set from a negative set.

    Only entries of ``values`` whose index label belongs to ``true_set`` or
    ``false_set`` are kept. Labels are 1 for members of ``true_set`` and 0
    otherwise, and the negated values are used as the classifier score.

    Args:
        values: Object with an ``index`` supporting ``isin`` and boolean-mask
            selection (presumably a pandas Series - confirm with callers).
        true_set: Set of positive labels. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.
        false_set: Set of negative labels. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.
        fdr_thres: The reported recall is the maximum recall over curve
            points whose precision is strictly greater than ``1 - fdr_thres``.
        return_curve: When true, the curve arrays are returned as well.

    Returns:
        ``(ap, recall_fdr)``, or
        ``(ap, recall_fdr, precision, recall, thres)`` if ``return_curve``.
    """
    # NOTE(review): the call to `precision_recall_curve` below has to resolve
    # to the curve routine (presumably scikit-learn's), not to this function.
    # That holds if this definition lives in a class namespace - confirm.
    positives = true_set
    if positives is None:
        positives = Utils.get_essential_genes(return_series=False)

    negatives = false_set
    if negatives is None:
        negatives = Utils.get_non_essential_genes(return_series=False)

    labelled = positives.union(negatives)
    scores = values[values.index.isin(labelled)]
    labels = scores.index.isin(positives).astype(int)

    # The sign is flipped before scoring, so the smallest values receive the
    # largest scores.
    flipped = -scores

    ap = average_precision_score(labels, flipped)
    precision, recall, thres = precision_recall_curve(labels, flipped)

    above_cutoff = precision > (1 - fdr_thres)
    recall_fdr = recall[above_cutoff].max()

    if return_curve:
        return ap, recall_fdr, precision, recall, thres
    return ap, recall_fdr
def aroc_threshold(values, true_set=None, false_set=None, fpr_thres=0.01, return_curve=False):
    """Compute a (partial) ROC AUC and the value cut-off at a given FPR.

    Only entries of ``values`` whose index label belongs to ``true_set`` or
    ``false_set`` are kept. Labels are 1 for members of ``true_set`` and 0
    otherwise, and the negated values are used as the classifier score.

    Args:
        values: Object with an ``index`` supporting ``isin`` and boolean-mask
            selection (presumably a pandas Series - confirm with callers).
        true_set: Set of positive labels. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.
        false_set: Set of negative labels. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.
        fpr_thres: Passed to ``roc_auc_score`` as ``max_fpr``. When not
            ``None`` it also selects the ROC points (``fpr <= fpr_thres``)
            from which the cut-off is taken.
        return_curve: When true, ``fpr`` and ``tpr`` are returned as well.

    Returns:
        ``(auc_fpr, fc_thres_fpr)``, or ``(auc_fpr, fc_thres_fpr, fpr, tpr)``
        if ``return_curve``. ``fc_thres_fpr`` is ``None`` when ``fpr_thres``
        is ``None``.
    """
    positives = true_set
    if positives is None:
        positives = Utils.get_essential_genes(return_series=False)

    negatives = false_set
    if negatives is None:
        negatives = Utils.get_non_essential_genes(return_series=False)

    labelled = positives.union(negatives)
    scores = values[values.index.isin(labelled)]
    labels = scores.index.isin(positives).astype(int)

    # The sign is flipped before scoring, so the smallest values receive the
    # largest scores.
    flipped = -scores

    fpr, tpr, thres = roc_curve(labels, flipped)
    auc_fpr = roc_auc_score(labels, flipped, max_fpr=fpr_thres)

    fc_thres_fpr = None
    if fpr_thres is not None:
        # Thresholds live on the flipped scale; negating the smallest one
        # within the FPR budget maps it back onto the scale of `values`.
        fc_thres_fpr = -min(thres[fpr <= fpr_thres])

    if return_curve:
        return auc_fpr, fc_thres_fpr, fpr, tpr
    return auc_fpr, fc_thres_fpr
def define_sgrnas_sets(clib, fc=None, add_controls=True, dataset_name="Yusa_v1"):
    """Group the sgRNAs of a library into essential / non-essential / control sets.

    Args:
        clib: Library table indexed by sgRNA identifier with a ``"Gene"``
            column (the code reads ``clib["Gene"]`` and ``clib.index``).
        fc: Optional fold-change table indexed by sgRNA identifier. When
            given, each set also carries the per-sgRNA median across columns
            (``median(1)``) with missing values dropped. When ``None`` the
            ``fc`` entry of every set is ``None``.
        add_controls: Whether to add the ``"nontargeting"`` set.
        dataset_name: Selects how control guides are recognised. For the
            listed Yusa / Sabatini_Lander_AML names, controls are index
            labels starting with ``"CTRL0"``; otherwise they are rows whose
            ``"Gene"`` starts with ``"NO_CURRENT_"``.

    Returns:
        dict mapping set name (``"essential"``, ``"nonessential"`` and
        optionally ``"nontargeting"``) to a dict with keys ``color``,
        ``sgrnas`` (a set of index labels) and ``fc``.
    """

    def median_fc(sgrnas):
        # Single place for the `fc is None` guard. Previously the control
        # branch called `fc.reindex` unconditionally and crashed with the
        # default arguments (fc=None, add_controls=True).
        if fc is None:
            return None
        return fc.reindex(sgrnas).median(1).dropna()

    sgrna_sets = dict()

    # sgRNA essential
    sgrnas_essential = Utils.get_essential_genes(return_series=False)
    sgrnas_essential = set(clib[clib["Gene"].isin(sgrnas_essential)].index)
    sgrna_sets["essential"] = dict(
        color="#e6550d",
        sgrnas=sgrnas_essential,
        fc=median_fc(sgrnas_essential),
    )

    # sgRNA non-essential
    sgrnas_nonessential = Utils.get_non_essential_genes(return_series=False)
    sgrnas_nonessential = set(clib[clib["Gene"].isin(sgrnas_nonessential)].index)
    sgrna_sets["nonessential"] = dict(
        color="#3182bd",
        sgrnas=sgrnas_nonessential,
        fc=median_fc(sgrnas_nonessential),
    )

    # sgRNA non-targeting
    if add_controls:
        if dataset_name in [
            "Yusa_v1",
            "Yusa v1",
            "Yusa_v1.1",
            "Yusa v1.1",
            "Sabatini_Lander_AML",
        ]:
            sgrnas_control = {i for i in clib.index if i.startswith("CTRL0")}
        else:
            sgrnas_control = set(
                clib[[i.startswith("NO_CURRENT_") for i in clib["Gene"]]].index
            )

        sgrna_sets["nontargeting"] = dict(
            color="#31a354",
            sgrnas=sgrnas_control,
            fc=median_fc(sgrnas_control),
        )

    return sgrna_sets
def project_score_data(sgrnas, subset=None):
    """Assemble scaled, per-model fold-changes for ``sgrnas`` from two datasets.

    Steps, in order:
      1. Read the Project Score manifest shipped under the ``crispy``
         package data directory and expand it to one row per
         ``experiment_identifier`` entry, carrying ``model_id``.
      2. Optionally keep only rows whose ``model_id`` is in ``subset``.
      3. For the ``"Yusa_v1"`` and ``"Yusa_v1.1"`` datasets, compute
         fold-changes from RPM-normalised counts against the plasmids and
         average columns by ``model_id``.
      4. Scale both tables with ``ReadCounts.scale`` using essential /
         non-essential sgRNA sets taken from the ``Yusa_v1`` library.
      5. Select ``sgrnas`` from both tables, join them column-wise and drop
         rows containing missing values.

    Args:
        sgrnas: Row labels passed to ``.loc`` on both scaled tables.
        subset: Optional collection of ``model_id`` values to keep.

    Returns:
        The column-wise concatenation of both selections after ``dropna()``.
    """
    ddir = pkg_resources.resource_filename("crispy", "data/")
    manifest = pd.read_csv(
        f"{ddir}/crispr_manifests/project_score_manifest.csv.gz")

    def expand_row(row):
        # The comma-separated fields become one output row per entry;
        # the scalar model_id is broadcast across them.
        return pd.DataFrame(
            dict(
                model_id=row["model_id"],
                s_ids=row["library"].split(", "),
                s_lib=row["experiment_identifier"].split(", "),
            )
        )

    # Index labels are used as positions for `iloc`, as in the original loop.
    s_map = pd.concat(
        [expand_row(manifest.iloc[i]) for i in manifest.index]
    ).set_index("s_lib")

    if subset is not None:
        s_map = s_map[s_map["model_id"].isin(subset)]

    def model_fold_changes(dataset):
        # Fold-change vs. plasmid, then mean over columns sharing a model_id.
        fold_changes = dataset.counts.norm_rpm().foldchange(dataset.plasmids)
        return fold_changes.groupby(s_map["model_id"], axis=1).mean()

    score_v1 = CRISPRDataSet("Yusa_v1")
    v1_fc = model_fold_changes(score_v1)

    score_v11 = CRISPRDataSet("Yusa_v1.1")
    v11_fc = model_fold_changes(score_v11)

    # Both reference sets come from the v1 library and are reused for v1.1.
    in_essential = score_v1.lib["Gene"].isin(Utils.get_essential_genes())
    ess = set(score_v1.lib[in_essential].index)
    in_non_essential = score_v1.lib["Gene"].isin(
        Utils.get_non_essential_genes())
    ness = set(score_v1.lib[in_non_essential].index)

    v1_scaled = ReadCounts(v1_fc).scale(essential=ess, non_essential=ness)
    v11_scaled = ReadCounts(v11_fc).scale(essential=ess, non_essential=ness)

    selected = [v1_scaled.loc[sgrnas], v11_scaled.loc[sgrnas]]
    return pd.concat(selected, axis=1).dropna()
def pr_curve(rank, true_set=None, false_set=None, min_events=10):
    """Return the average precision of ``rank`` for ``true_set`` vs ``false_set``.

    Only entries of ``rank`` whose index label belongs to ``true_set`` or
    ``false_set`` are kept. Labels are 1 for members of ``true_set`` and 0
    otherwise, and the negated values are used as the classifier score.

    Args:
        rank: Object with an ``index`` supporting ``isin`` and boolean-mask
            selection (presumably a pandas Series - confirm with callers).
        true_set: Set of positive labels. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.
        false_set: Set of negative labels. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.
        min_events: Minimum number of positives required among the kept
            entries.

    Returns:
        The average-precision score, or ``np.nan`` when no entry of ``rank``
        is labelled or fewer than ``min_events`` positives are present.
    """
    if true_set is None:
        true_set = Utils.get_essential_genes(return_series=False)

    if false_set is None:
        false_set = Utils.get_non_essential_genes(return_series=False)

    index_set = true_set.union(false_set)
    rank = rank[rank.index.isin(index_set)]

    if len(rank) == 0:
        return np.nan

    y_true = rank.index.isin(true_set).astype(int)

    if y_true.sum() < min_events:
        return np.nan

    # A precision-recall summary is what the name promises. The previous
    # implementation returned the ROC AUC, which is a different metric.
    from sklearn.metrics import average_precision_score

    return average_precision_score(y_true, -rank)
def scale(self, essential=None, non_essential=None, metric=np.median):
    """Rescale the table using essential and non-essential reference rows.

    For every column, ``metric`` is evaluated over the rows listed in
    ``non_essential`` and over the rows listed in ``essential`` (rows with
    missing values are dropped first). The result is
    ``(self - non_essential_metric) / (non_essential_metric - essential_metric)``.

    Args:
        essential: Set of index labels. Defaults to
            ``Utils.get_essential_genes(return_series=False)``.
        non_essential: Set of index labels. Defaults to
            ``Utils.get_non_essential_genes(return_series=False)``.
        metric: Callable invoked as ``metric(frame, axis=0)``.

    Returns:
        The rescaled table.

    Raises:
        AssertionError: If either reference set shares no label with
            ``self.index``.
    """
    if essential is None:
        essential = Utils.get_essential_genes(return_series=False)

    if non_essential is None:
        non_essential = Utils.get_non_essential_genes(return_series=False)

    essential_overlap = essential.intersection(self.index)
    assert len(essential_overlap) != 0, \
        "DataFrame has no index overlapping with essential list"

    non_essential_overlap = non_essential.intersection(self.index)
    assert len(non_essential_overlap) != 0, \
        "DataFrame has no index overlapping with non essential list"

    essential_rows = self.reindex(essential).dropna()
    essential_metric = metric(essential_rows, axis=0)

    non_essential_rows = self.reindex(non_essential).dropna()
    non_essential_metric = metric(non_essential_rows, axis=0)

    centred = self.subtract(non_essential_metric)
    spread = non_essential_metric - essential_metric
    return centred.divide(spread)
def define_controls(
    n_genes=3,
    cancer_type="Colorectal Carcinoma",
    cn_min=1,
    cn_max=5,
    crisp_min=-0.10,
    jacks_thres=0.25,
    offtarget=None,
):
    """Pick control sgRNAs targeting non-essential genes.

    Pipeline, as implemented below:
      1. Take the samples whose ``cancer_type`` matches.
      2. Start from the non-essential genes minus the ``"Gene"`` entries of
         ``Utils.get_sanger_essential()``.
      3. Select 2 sgRNAs per gene with ``gselection.select_sgrnas`` and keep
         those with ``Library == 'KosukeYusa'``.
      4. Fetch their fold-changes with ``project_score_data`` and keep guides
         whose 25% quantile across samples is ``>= crisp_min``.
      5. Keep genes that still have exactly 2 guides, whose copy-number
         ``min``/``max`` across samples lie within ``[cn_min, cn_max]`` and
         that have a ``location`` in the HGNC table.
      6. Rank genes by the mean of ``min_crispr`` (descending) and keep the
         first ``n_genes``.

    Args:
        n_genes: Number of control genes to keep.
        cancer_type: Value matched against the sample sheet ``cancer_type``.
        cn_min: Lower bound on the per-gene minimum copy number.
        cn_max: Upper bound on the per-gene maximum copy number.
        crisp_min: Lower bound on the per-guide 25% fold-change quantile.
        jacks_thres: Forwarded to ``gselection.select_sgrnas``.
        offtarget: Forwarded to ``gselection.select_sgrnas``. ``None`` (the
            default) means ``[1, 0, 0]``.

    Returns:
        Rows of ``gselection.masterlib`` for the selected guides, with
        ``Confidence`` set to ``"Control"``, restricted to ``LIB_COLUMNS``
        and sorted by ``Approved_Symbol``.
    """
    # A fresh list per call: a list literal as the default would be shared
    # between calls and could be mutated by the callee.
    if offtarget is None:
        offtarget = [1, 0, 0]

    # Samples
    samples = set(DataImporter.Sample().samplesheet.query(
        f"cancer_type == '{cancer_type}'").index)

    # Non-essential genes
    ness = Utils.get_non_essential_genes(return_series=False)
    ness = ness - set(Utils.get_sanger_essential()["Gene"])

    # Non-essential genes sgRNAs
    ness_sgrnas = pd.concat(
        [
            gselection.select_sgrnas(
                g, 2, jacks_thres=jacks_thres, offtarget=offtarget
            ).assign(gene=g)
            for g in ness
        ],
        ignore_index=True,
    ).query("Library == 'KosukeYusa'")

    ness_sgrnas_fc = project_score_data(ness_sgrnas["sgRNA_ID"], samples)

    # Per-guide summary statistics across samples.
    ness_sgrnas_fc_ds = ness_sgrnas_fc.T.describe().T.dropna()
    # `.copy()` so the column assignment below acts on an independent frame
    # instead of a filtered slice.
    ness_sgrnas_fc_ds = ness_sgrnas_fc_ds[
        ness_sgrnas_fc_ds["25%"] >= crisp_min].copy()
    ness_sgrnas_fc_ds["Approved_Symbol"] = (
        ness_sgrnas.set_index("sgRNA_ID").loc[
            ness_sgrnas_fc_ds.index, "Approved_Symbol"].values)

    ness_sgrnas = ness_sgrnas.set_index("sgRNA_ID").loc[
        ness_sgrnas_fc_ds.index]

    # Import different levels of information
    ddir = pkg_resources.resource_filename("crispy", "data/")
    cn = DataImporter.CopyNumber(
        f"{ddir}/copy_number/cnv_abs_copy_number_picnic_20191101.csv.gz"
    ).filter(subset=samples)
    hgnc = pd.read_csv(
        f"{DPATH}/protein-coding_gene.txt", sep="\t", index_col=1)

    # Control genes: only genes for which both guides survived the filters.
    controls = ness_sgrnas.groupby("Approved_Symbol")["Library"].count()
    controls = list(controls[controls == 2].index)

    controls = pd.concat(
        [
            cn.reindex(controls).dropna().T.describe().T,
            hgnc.reindex(controls)["location"],
        ],
        axis=1,
        sort=False,
    ).dropna()
    controls = controls.query(f"(min >= {cn_min}) and (max <= {cn_max})")
    controls = controls.reset_index().rename(
        columns={"index": "Approved_Symbol"})
    controls = controls.merge(
        ness_sgrnas_fc_ds.reset_index(),
        on="Approved_Symbol",
        suffixes=("_cn", "_crispr"),
    )

    control_genes = list(
        controls.groupby("Approved_Symbol")["min_crispr"].mean().sort_values(
            ascending=False)[:n_genes].index)
    # `.copy()` for the same reason as above: a column is assigned next.
    controls = controls[
        controls["Approved_Symbol"].isin(control_genes)].copy()
    controls["location"] = hgnc.loc[
        controls["Approved_Symbol"], "location"].values

    control_guides = gselection.masterlib[
        gselection.masterlib["sgRNA_ID"].isin(
            controls["sgRNA"])].assign(Confidence="Control")[LIB_COLUMNS]

    return control_guides.sort_values("Approved_Symbol")
col_colors=pd.Series(sample_pal)[plot_df.columns].rename("Library"), row_colors=pd.Series(sample_pal)[plot_df.index].rename("Library"), cbar_pos=None, ) plt.savefig(f"{RPATH}/minlibcas9_screens_clustermap_gene_fc.pdf", bbox_inches="tight") plt.close("all") # Recall gene lists # gsets_aucs = {} for n, gset in [ ("essential", Utils.get_essential_genes()), ("non-essential", Utils.get_non_essential_genes()), ]: # Aroc plt.figure(figsize=(2, 2), dpi=600) ax = plt.gca() _, stats_ess = QCplot.plot_cumsum_auc(fc_gene[samples], gset, palette=sample_pal, legend_prop={"size": 4}, ax=ax) plt.title(f"{n} recall curve") plt.xlabel("Percent-rank of genes") plt.ylabel("Cumulative fraction") plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both") plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf", bbox_inches="tight")