def aroc_threshold(values, true_set=None, false_set=None, fpr_thres=0.01, return_curve=False):
    # Default to curated essential/non-essential gene sets
    if true_set is None:
        true_set = Utils.get_essential_genes(return_series=False)

    if false_set is None:
        false_set = Utils.get_non_essential_genes(return_series=False)

    index_set = true_set.union(false_set)

    # Restrict the ranking to genes annotated in either set
    rank = values[values.index.isin(index_set)]
    y_true = rank.index.isin(true_set).astype(int)

    # Lower fold-changes indicate stronger depletion, hence the sign flip
    fpr, tpr, thres = roc_curve(y_true, -rank)
    auc_fpr = roc_auc_score(y_true, -rank, max_fpr=fpr_thres)

    # Fold-change threshold corresponding to the FPR cut-off
    if fpr_thres is not None:
        fc_thres_fpr = -min(thres[fpr <= fpr_thres])
    else:
        fc_thres_fpr = None

    res = (auc_fpr, fc_thres_fpr, fpr, tpr) if return_curve else (auc_fpr, fc_thres_fpr)

    return res
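# Usage sketch (illustrative only): `gene_fc` stands in for a pandas.Series of
# gene-level fold-changes indexed by gene symbol; it is not defined in this module.
#
#   auroc, fc_thres = aroc_threshold(gene_fc, fpr_thres=0.01)
#   auroc, fc_thres, fpr, tpr = aroc_threshold(gene_fc, return_curve=True)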
def precision_recall_curve(values, true_set=None, false_set=None, fdr_thres=0.01, return_curve=False):
    if true_set is None:
        true_set = Utils.get_essential_genes(return_series=False)

    if false_set is None:
        false_set = Utils.get_non_essential_genes(return_series=False)

    index_set = true_set.union(false_set)

    rank = values[values.index.isin(index_set)]
    y_true = rank.index.isin(true_set).astype(int)

    ap = average_precision_score(y_true, -rank)

    # Import sklearn's implementation under an alias: this function shadows the
    # imported name, so calling precision_recall_curve here would recurse
    from sklearn.metrics import precision_recall_curve as sk_precision_recall_curve

    precision, recall, thres = sk_precision_recall_curve(y_true, -rank)

    # Highest recall achieved while keeping precision above 1 - FDR threshold
    recall_fdr = recall[precision > (1 - fdr_thres)].max()

    res = (
        (ap, recall_fdr, precision, recall, thres)
        if return_curve
        else (ap, recall_fdr)
    )

    return res
def recall_curve(rank, index_set=None, min_events=None):
    """
    Calculate x and y of the recall curve.

    :param rank: pandas.Series of values to rank (e.g. fold-changes)
    :param index_set: set-like of indices in rank counted as events
    :param min_events: int or None, optional
        Minimum number of index_set elements present in rank required to
        calculate the curve
    :return: tuple (x, y, xy_auc), or None if min_events is not met
    """
    x = rank.sort_values().dropna()

    # Observed cumulative sum of events
    if index_set is None:
        index_set = Utils.get_essential_genes(return_series=False)

    y = x.index.isin(index_set)

    if (min_events is not None) and (sum(y) < min_events):
        return None

    y = np.cumsum(y) / sum(y)

    # Rank fold-changes
    x = st.rankdata(x) / x.shape[0]

    # Calculate AUC
    xy_auc = auc(x, y)

    return x, y, xy_auc
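# Usage sketch (illustrative only): `gene_fc` is a hypothetical pandas.Series of
# gene-level fold-changes; by default the event set is the curated essential genes.
#
#   x, y, recall_auc = recall_curve(gene_fc)
#   x, y, recall_auc = recall_curve(gene_fc, index_set=my_gene_set, min_events=10)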
def define_sgrnas_sets(clib, fc=None, add_controls=True, dataset_name="Yusa_v1"):
    sgrna_sets = dict()

    # sgRNA essential
    sgrnas_essential = Utils.get_essential_genes(return_series=False)
    sgrnas_essential = set(clib[clib["Gene"].isin(sgrnas_essential)].index)
    sgrnas_essential_fc = (
        None if fc is None else fc.reindex(sgrnas_essential).median(1).dropna()
    )

    sgrna_sets["essential"] = dict(
        color="#e6550d", sgrnas=sgrnas_essential, fc=sgrnas_essential_fc
    )

    # sgRNA non-essential
    sgrnas_nonessential = Utils.get_non_essential_genes(return_series=False)
    sgrnas_nonessential = set(clib[clib["Gene"].isin(sgrnas_nonessential)].index)
    sgrnas_nonessential_fc = (
        None if fc is None else fc.reindex(sgrnas_nonessential).median(1).dropna()
    )

    sgrna_sets["nonessential"] = dict(
        color="#3182bd", sgrnas=sgrnas_nonessential, fc=sgrnas_nonessential_fc
    )

    # sgRNA non-targeting
    if add_controls:
        if dataset_name in ["Yusa_v1", "Yusa v1", "Yusa_v1.1", "Yusa v1.1", "Sabatini_Lander_AML"]:
            sgrnas_control = {i for i in clib.index if i.startswith("CTRL0")}
        else:
            sgrnas_control = set(
                clib[[i.startswith("NO_CURRENT_") for i in clib["Gene"]]].index
            )

        # Guard against fc being None, as done for the other sets
        sgrnas_control_fc = (
            None if fc is None else fc.reindex(sgrnas_control).median(1).dropna()
        )

        sgrna_sets["nontargeting"] = dict(
            color="#31a354",
            sgrnas=sgrnas_control,
            fc=sgrnas_control_fc,
        )

    return sgrna_sets
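# Usage sketch (illustrative only): `dataset` stands in for a CRISPRDataSet-like
# object exposing a `lib` annotation table, and `sgrna_fc` for its sgRNA-level
# fold-change matrix; neither is defined in this module.
#
#   sgrna_sets = define_sgrnas_sets(dataset.lib, fc=sgrna_fc, dataset_name="Yusa_v1")
#   essential_median_fc = sgrna_sets["essential"]["fc"].median()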
def filter(
    self,
    dtype="merged",
    subset=None,
    scale=True,
    std_filter=False,
    abs_thres=None,
    drop_core_essential=False,
    min_events=5,
    drop_core_essential_broad=False,
    binarise_thres=None,
):
    # Filters are defined on the scaled scores
    df = self.get_data(scale=True, dtype=dtype)

    # - Filters
    # Subset matrices
    if subset is not None:
        df = df.loc[:, df.columns.isin(subset)]

    # Filter by scaled scores
    if abs_thres is not None:
        df = df[(df.abs() > abs_thres).sum(1) >= min_events]

    # Filter out core essential genes
    if drop_core_essential:
        df = df[~df.index.isin(Utils.get_adam_core_essential())]

    if drop_core_essential_broad:
        df = df[~df.index.isin(Utils.get_broad_core_essential())]

    # - Subset matrices
    x = self.get_data(scale=scale, dtype=dtype).reindex(
        index=df.index, columns=df.columns
    )

    if binarise_thres is not None:
        x = (x < binarise_thres).astype(int)

    # Keep only rows with non-zero variance
    if std_filter:
        x = x[x.std(1) > 0]

    return x
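# Usage sketch (illustrative only): `crispr` stands in for an instance of the
# class that defines this filter method; the argument values are arbitrary.
#
#   fc = crispr.filter(dtype="merged", abs_thres=0.5, drop_core_essential=True)
#   fc_bin = crispr.filter(binarise_thres=-0.5)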
def project_score_data(sgrnas, subset=None):
    ddir = pkg_resources.resource_filename("crispy", "data/")

    score_manifest = pd.read_csv(
        f"{ddir}/crispr_manifests/project_score_manifest.csv.gz"
    )

    s_map = []
    for i in score_manifest.index:
        s_map.append(
            pd.DataFrame(
                dict(
                    model_id=score_manifest.iloc[i]["model_id"],
                    s_ids=score_manifest.iloc[i]["library"].split(", "),
                    s_lib=score_manifest.iloc[i]["experiment_identifier"].split(", "),
                )
            )
        )
    s_map = pd.concat(s_map).set_index("s_lib")

    if subset is not None:
        s_map = s_map[s_map["model_id"].isin(subset)]

    score_v1 = CRISPRDataSet("Yusa_v1")
    score_v1_fc = score_v1.counts.norm_rpm().foldchange(score_v1.plasmids)
    score_v1_fc = score_v1_fc.groupby(s_map["model_id"], axis=1).mean()

    score_v11 = CRISPRDataSet("Yusa_v1.1")
    score_v11_fc = score_v11.counts.norm_rpm().foldchange(score_v11.plasmids)
    score_v11_fc = score_v11_fc.groupby(s_map["model_id"], axis=1).mean()

    ess = set(
        score_v1.lib[score_v1.lib["Gene"].isin(Utils.get_essential_genes())].index
    )
    ness = set(
        score_v1.lib[score_v1.lib["Gene"].isin(Utils.get_non_essential_genes())].index
    )

    score_v1_fc = ReadCounts(score_v1_fc).scale(essential=ess, non_essential=ness)
    score_v11_fc = ReadCounts(score_v11_fc).scale(essential=ess, non_essential=ness)

    score_fc = pd.concat(
        [score_v1_fc.loc[sgrnas], score_v11_fc.loc[sgrnas]], axis=1
    ).dropna()

    return score_fc
def pr_curve(rank, true_set=None, false_set=None, min_events=10):
    # Note: despite its name, this helper returns the AUROC of the ranking
    if true_set is None:
        true_set = Utils.get_essential_genes(return_series=False)

    if false_set is None:
        false_set = Utils.get_non_essential_genes(return_series=False)

    index_set = true_set.union(false_set)

    rank = rank[rank.index.isin(index_set)]

    if len(rank) == 0:
        return np.nan

    y_true = rank.index.isin(true_set).astype(int)

    if sum(y_true) < min_events:
        return np.nan

    return roc_auc_score(y_true, -rank)
def scale(self, essential=None, non_essential=None, metric=np.median):
    if essential is None:
        essential = Utils.get_essential_genes(return_series=False)

    if non_essential is None:
        non_essential = Utils.get_non_essential_genes(return_series=False)

    assert (
        len(essential.intersection(self.index)) != 0
    ), "DataFrame has no index overlapping with essential list"

    assert (
        len(non_essential.intersection(self.index)) != 0
    ), "DataFrame has no index overlapping with non essential list"

    essential_metric = metric(self.reindex(essential).dropna(), axis=0)
    non_essential_metric = metric(self.reindex(non_essential).dropna(), axis=0)

    return self.subtract(non_essential_metric).divide(
        non_essential_metric - essential_metric
    )
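# Usage sketch (illustrative only): scaling centres the non-essential median at 0
# and the essential median at -1. `fc` stands in for a ReadCounts-like fold-change
# matrix (samples as columns); it is not defined here.
#
#   fc_scaled = fc.scale()
#   fc_scaled = fc.scale(essential=my_ess_set, non_essential=my_ness_set)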
def plot_rearrangements(
    cls,
    brass_bedpe,
    ascat_bed,
    crispy_bed,
    chrm,
    chrm_size=None,
    xlim=None,
    scale=1e6,
    show_legend=True,
    unfold_inversions=False,
    sv_alpha=1.0,
    sv_lw=0.3,
    highlight=None,
    mark_essential=False,
):
    # - Define default params
    chrm_size = Utils.CHR_SIZES_HG19 if chrm_size is None else chrm_size
    xlim = (0, chrm_size[chrm]) if xlim is None else xlim

    # - Build data-frames
    # BRASS
    brass_ = brass_bedpe[(brass_bedpe["chr1"] == chrm) | (brass_bedpe["chr2"] == chrm)]

    # ASCAT
    ascat_ = ascat_bed.query(f"chr == '{chrm}'")

    # CRISPR
    crispr_ = crispy_bed[crispy_bed["chr"] == chrm]
    crispr_ = crispr_.assign(location=crispr_[["sgrna_start", "sgrna_end"]].mean(1))

    crispr_gene_ = crispr_.groupby("gene")[["fold_change", "location"]].mean()

    if brass_.shape[0] == 0:
        return None, None, None

    # - Plot
    f, (ax1, ax2, ax3) = plt.subplots(
        3, 1, sharex="all", gridspec_kw={"height_ratios": [1, 2, 2]}
    )

    # Top panel
    ax1.axhline(0.0, lw=0.3, color=cls.PAL_DBGD[0])
    ax1.set_ylim(-1, 1)

    # Middle panel
    for i, (_, s, e, cn) in ascat_.iterrows():
        ax2.plot(
            (s / scale, e / scale),
            (cn, cn),
            alpha=1.0,
            c=cls.PAL_DBGD[2],
            zorder=3,
            label="ASCAT",
            lw=2,
        )

    # Bottom panel
    ax3.scatter(
        crispr_["location"] / scale,
        crispr_["fold_change"],
        s=1,
        alpha=0.5,
        lw=0,
        c=cls.PAL_DBGD[1],
        label="CRISPR-Cas9",
        zorder=1,
    )
    ax3.axhline(0.0, lw=0.3, color=cls.PAL_DBGD[0])

    for (s, e), gp_mean in crispr_.groupby(["start", "end"])["fold_change"]:
        ax3.plot(
            (s / scale, e / scale),
            (gp_mean.mean(), gp_mean.mean()),
            alpha=1.0,
            c=cls.PAL_DBGD[2],
            zorder=3,
            label="Segment mean",
            lw=2,
        )

    if mark_essential:
        ess = Utils.get_adam_core_essential()
        ax3.scatter(
            crispr_gene_.reindex(ess)["location"] / scale,
            crispr_gene_.reindex(ess)["fold_change"],
            s=5,
            marker="x",
            lw=0.3,
            c=cls.PAL_DBGD[1],
            alpha=0.4,
            edgecolors="#fc8d62",
            label="Core-essential",
        )

    # Highlight
    if highlight is not None:
        for ic, i in zip(
            *(sns.color_palette("tab20", n_colors=len(highlight)), highlight)
        ):
            if i in crispr_.index:
                ax3.scatter(
                    crispr_.loc[i, "location"] / scale,
                    crispr_.loc[i]["fold_change"],
                    s=14,
                    marker="X",
                    lw=0,
                    c=ic,
                    alpha=0.9,
                    label=i,
                )

    #
    for c1, s1, e1, c2, s2, e2, st1, st2, sv in brass_[
        [
            "chr1",
            "start1",
            "end1",
            "chr2",
            "start2",
            "end2",
            "strand1",
            "strand2",
            "svclass",
        ]
    ].values:
        stype = Utils.svtype(st1, st2, sv, unfold_inversions)
        stype_col = cls.SV_PALETTE[stype]

        zorder = 2 if stype == "tandem-duplication" else 1

        x1_mean, x2_mean = np.mean([s1, e1]), np.mean([s2, e2])

        # Plot arc
        if c1 == c2:
            angle = 0 if stype in ["tandem-duplication", "deletion"] else 180

            xy = (np.mean([x1_mean, x2_mean]) / scale, 0)

            ax1.add_patch(
                Arc(
                    xy,
                    (x2_mean - x1_mean) / scale,
                    1.0,
                    angle=angle,
                    theta1=0,
                    theta2=180,
                    edgecolor=stype_col,
                    lw=sv_lw,
                    zorder=zorder,
                    alpha=sv_alpha,
                )
            )

        # Plot segments
        for ymin, ymax, ax in [(-1, 0.5, ax1), (-1, 1, ax2), (0, 1, ax3)]:
            if (c1 == chrm) and (xlim[0] <= x1_mean <= xlim[1]):
                ax.axvline(
                    x=x1_mean / scale,
                    ymin=ymin,
                    ymax=ymax,
                    c=stype_col,
                    linewidth=sv_lw,
                    zorder=zorder,
                    clip_on=False,
                    label=stype,
                    alpha=sv_alpha,
                )

            if (c2 == chrm) and (xlim[0] <= x2_mean <= xlim[1]):
                ax.axvline(
                    x=x2_mean / scale,
                    ymin=ymin,
                    ymax=ymax,
                    c=stype_col,
                    linewidth=sv_lw,
                    zorder=zorder,
                    clip_on=False,
                    label=stype,
                    alpha=sv_alpha,
                )

        # Translocation label
        if stype == "translocation":
            if (c1 == chrm) and (xlim[0] <= x1_mean <= xlim[1]):
                ax1.text(
                    x1_mean / scale,
                    0,
                    " to {}".format(c2),
                    color=stype_col,
                    ha="center",
                    fontsize=5,
                    rotation=90,
                    va="bottom",
                )

            if (c2 == chrm) and (xlim[0] <= x2_mean <= xlim[1]):
                ax1.text(
                    x2_mean / scale,
                    0,
                    " to {}".format(c1),
                    color=stype_col,
                    ha="center",
                    fontsize=5,
                    rotation=90,
                    va="bottom",
                )

    #
    if show_legend:
        by_label = {
            l.capitalize(): p
            for p, l in zip(*(ax2.get_legend_handles_labels()))
            if l in cls.SV_PALETTE
        }
        ax1.legend(
            by_label.values(),
            by_label.keys(),
            loc="center left",
            bbox_to_anchor=(1.02, 0.5),
            prop={"size": 6},
            frameon=False,
        )

        by_label = {
            l: p
            for p, l in zip(*(ax2.get_legend_handles_labels()))
            if l not in cls.SV_PALETTE
        }
        ax2.legend(
            by_label.values(),
            by_label.keys(),
            loc="center left",
            bbox_to_anchor=(1.02, 0.5),
            prop={"size": 6},
            frameon=False,
        )

        by_label = {
            l: p
            for p, l in zip(*(ax3.get_legend_handles_labels()))
            if l not in cls.SV_PALETTE
        }
        ax3.legend(
            by_label.values(),
            by_label.keys(),
            loc="center left",
            bbox_to_anchor=(1.02, 0.5),
            prop={"size": 6},
            frameon=False,
        )

    #
    ax1.axis("off")

    #
    ax2.set_ylim(0, np.ceil(ascat_["copy_number"].quantile(0.9999) + 0.5))

    #
    ax2.yaxis.set_major_locator(plticker.MultipleLocator(base=2.0))
    ax3.yaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

    #
    ax2.tick_params(axis="both", which="major", labelsize=6)
    ax3.tick_params(axis="both", which="major", labelsize=6)

    #
    ax1.set_ylabel("SV")
    ax2.set_ylabel("Copy-number", fontsize=7)
    ax3.set_ylabel("Loss of fitness", fontsize=7)

    #
    plt.xlabel("Position on chromosome {} (Mb)".format(chrm.replace("chr", "")))

    #
    plt.xlim(xlim[0] / scale, xlim[1] / scale)

    return ax1, ax2, ax3
def plot_chromosome(
    cls,
    crispy_bed,
    ascat_bed,
    chrm,
    y_var="fold_change",
    highlight=None,
    ax=None,
    legend=False,
    scale=1e6,
    tick_base=1,
    legend_size=5,
):
    if ax is None:
        ax = plt.gca()

    # - Build data-frames
    # ASCAT
    ascat_ = ascat_bed.query(f"Chr == '{chrm}'")

    # CRISPR
    crispr_ = crispy_bed[crispy_bed["Chr"] == chrm]
    crispr_ = crispr_.assign(location=crispr_[["sgRNA_Start", "sgRNA_End"]].mean(1))

    crispr_gene_ = crispr_.groupby("gene")[[y_var, "location"]].mean()

    # Plot original values
    ax.scatter(
        crispr_["location"] / scale,
        crispr_[y_var],
        s=6,
        marker=".",
        lw=0,
        c=cls.PAL_DBGD[1],
        alpha=0.4,
        label="CRISPR-Cas9",
    )

    # Segment mean
    for (s, e), gp_mean in crispr_.groupby(["Start", "End"])[y_var]:
        ax.plot(
            (s / scale, e / scale),
            (gp_mean.mean(), gp_mean.mean()),
            alpha=1.0,
            c=cls.PAL_DBGD[2],
            zorder=3,
            label="CRISPR-Cas9 segment mean",
            lw=2,
        )

    # Plot copy-number segments
    for s, e, cn in ascat_[["Start", "End", "copy_number"]].values:
        ax.plot(
            (s / scale, e / scale),
            (cn, cn),
            alpha=1.0,
            c=cls.PAL_DBGD[0],
            zorder=3,
            label="Copy-number segment",
            lw=2,
        )

    # Highlight
    if highlight is not None:
        for ic, i in zip(
            *(sns.color_palette("tab20", n_colors=len(highlight)), highlight)
        ):
            if i in crispr_gene_.index:
                ax.scatter(
                    crispr_gene_["location"].loc[i] / scale,
                    crispr_gene_[y_var].loc[i],
                    s=14,
                    marker="X",
                    lw=0,
                    c=ic,
                    alpha=0.9,
                    label=i,
                )

    # Misc
    ax.axhline(0, lw=0.3, ls="-", color="black")

    # Cytobands
    cytobands = Utils.get_cytobands(chrm=chrm)

    for i, (s, e, t) in enumerate(cytobands[["Start", "End", "band"]].values):
        if t == "acen":
            ax.axvline(s / scale, lw=0.2, ls="-", color=cls.PAL_DBGD[0], alpha=0.1)
            ax.axvline(e / scale, lw=0.2, ls="-", color=cls.PAL_DBGD[0], alpha=0.1)

        elif not i % 2:
            ax.axvspan(s / scale, e / scale, alpha=0.1, facecolor=cls.PAL_DBGD[0])

    # Legend (de-duplicate repeated labels)
    if legend:
        handles, labels = ax.get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        ax.legend(
            by_label.values(),
            by_label.keys(),
            loc="center left",
            bbox_to_anchor=(1, 0.5),
            prop={"size": legend_size},
            frameon=False,
        )

    ax.set_xlim(crispr_["Start"].min() / scale, crispr_["End"].max() / scale)

    ax.tick_params(axis="both", which="major", labelsize=5)

    ax.yaxis.set_major_locator(plticker.MultipleLocator(base=tick_base))

    return ax
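# Usage sketch (illustrative only): called on the plotting class that defines this
# method (referred to here as `PlotClass`, a placeholder); `crispy_bed` and
# `ascat_bed` stand in for per-sample CRISPR and ASCAT BED-like data-frames, and
# the chromosome label must match the one used in those tables.
#
#   ax = PlotClass.plot_chromosome(crispy_bed, ascat_bed, "8", highlight=["MYC"], legend=True)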
def define_controls(
    n_genes=3,
    cancer_type="Colorectal Carcinoma",
    cn_min=1,
    cn_max=5,
    crisp_min=-0.10,
    jacks_thres=0.25,
    offtarget=[1, 0, 0],
):
    # Samples
    samples = set(
        DataImporter.Sample()
        .samplesheet.query(f"cancer_type == '{cancer_type}'")
        .index
    )

    # Non-essential genes
    ness = Utils.get_non_essential_genes(return_series=False)
    ness = ness - set(Utils.get_sanger_essential()["Gene"])

    # Non-essential genes sgRNAs
    ness_sgrnas = pd.concat(
        [
            gselection.select_sgrnas(
                g, 2, jacks_thres=jacks_thres, offtarget=offtarget
            ).assign(gene=g)
            for g in ness
        ],
        ignore_index=True,
    ).query("Library == 'KosukeYusa'")

    ness_sgrnas_fc = project_score_data(ness_sgrnas["sgRNA_ID"], samples)

    ness_sgrnas_fc_ds = ness_sgrnas_fc.T.describe().T.dropna()
    ness_sgrnas_fc_ds = ness_sgrnas_fc_ds[ness_sgrnas_fc_ds["25%"] >= crisp_min]
    ness_sgrnas_fc_ds["Approved_Symbol"] = (
        ness_sgrnas.set_index("sgRNA_ID")
        .loc[ness_sgrnas_fc_ds.index, "Approved_Symbol"]
        .values
    )

    ness_sgrnas = ness_sgrnas.set_index("sgRNA_ID").loc[ness_sgrnas_fc_ds.index]

    # Import different levels of information
    ddir = pkg_resources.resource_filename("crispy", "data/")

    cn = DataImporter.CopyNumber(
        f"{ddir}/copy_number/cnv_abs_copy_number_picnic_20191101.csv.gz"
    ).filter(subset=samples)

    hgnc = pd.read_csv(f"{DPATH}/protein-coding_gene.txt", sep="\t", index_col=1)

    # Control genes
    controls = ness_sgrnas.groupby("Approved_Symbol")["Library"].count()
    controls = list(controls[controls == 2].index)

    controls = pd.concat(
        [
            cn.reindex(controls).dropna().T.describe().T,
            hgnc.reindex(controls)["location"],
        ],
        axis=1,
        sort=False,
    ).dropna()
    controls = controls.query(f"(min >= {cn_min}) and (max <= {cn_max})")
    controls = controls.reset_index().rename(columns={"index": "Approved_Symbol"})

    controls = controls.merge(
        ness_sgrnas_fc_ds.reset_index(),
        on="Approved_Symbol",
        suffixes=("_cn", "_crispr"),
    )

    control_genes = list(
        controls.groupby("Approved_Symbol")["min_crispr"]
        .mean()
        .sort_values(ascending=False)[:n_genes]
        .index
    )

    controls = controls[controls["Approved_Symbol"].isin(control_genes)]
    controls["location"] = hgnc.loc[controls["Approved_Symbol"], "location"].values

    control_guides = gselection.masterlib[
        gselection.masterlib["sgRNA_ID"].isin(controls["sgRNA"])
    ].assign(Confidence="Control")[LIB_COLUMNS]

    return control_guides.sort_values("Approved_Symbol")
    lw=0.05,
    col_colors=pd.Series(sample_pal)[plot_df.columns].rename("Library"),
    row_colors=pd.Series(sample_pal)[plot_df.index].rename("Library"),
    cbar_pos=None,
)
plt.savefig(
    f"{RPATH}/minlibcas9_screens_clustermap_gene_fc.pdf", bbox_inches="tight"
)
plt.close("all")


# Recall gene lists
#

gsets_aucs = {}
for n, gset in [
    ("essential", Utils.get_essential_genes()),
    ("non-essential", Utils.get_non_essential_genes()),
]:
    # Aroc
    plt.figure(figsize=(2, 2), dpi=600)
    ax = plt.gca()
    _, stats_ess = QCplot.plot_cumsum_auc(
        fc_gene[samples], gset, palette=sample_pal, legend_prop={"size": 4}, ax=ax
    )
    plt.title(f"{n} recall curve")
    plt.xlabel("Percent-rank of genes")
    plt.ylabel("Cumulative fraction")
    plt.grid(True, ls=":", lw=0.1, alpha=1.0, zorder=0, axis="both")
    plt.savefig(f"{RPATH}/minlibcas9_screens_roccurves_{n}.pdf",
cnv = cnv_obj.filter(subset=list(prot))
cnv_norm = np.log2(cnv.divide(prot_obj.ss.loc[cnv.columns, "ploidy"]) + 1)
LOG.info(f"Copy number: {cnv.shape}")


# Overlaps
#

samples = list(set.intersection(set(prot), set(gexp), set(cnv)))
genes = list(
    set.intersection(
        set(prot.index), set(gexp.index), set(cnv.index), set(prot_broad.index)
    )
)
LOG.info(f"Genes: {len(genes)}; Samples: {len(samples)}")


# Data transformations
#

gexp_t = pd.DataFrame(
    {i: Utils.gkn(gexp.loc[i].dropna()).to_dict() for i in genes}
).T


##

s_corr = pd.DataFrame(
    {
        s1: {
            s2: two_vars_correlation(prot[s1], gexp[s2])["corr"] for s2 in samples
        }
        for s1 in samples
    }
)
s_corr.to_csv("/Users/Downloads/Proteomics_Transcriptomics_Corr_Matrix.csv")


# Sample-wise Protein/Gene correlation with CopyNumber - Attenuation
    stromal_count != 1].index)]


# Import proteomics data-sets
#

dmatrix, ms_type, ctypes = [], [], []
for ctype, dfile in CPTAC_DATASETS:
    df = pd.read_csv(f"{CPTAC_DPATH}/linkedomics/{dfile}", sep="\t", index_col=0)

    if "COADREAD" in dfile:
        df = df.replace(0, np.nan)
        df = df.pipe(np.log2)

    df = pd.DataFrame(
        {i: Utils.gkn(df.loc[i].dropna()).to_dict() for i in df.index}
    ).T

    # Simplify barcode
    df.columns = [i[:12].replace(".", "-") for i in df]

    # Cancer type
    ctypes.append(pd.Series(ctype, index=df.columns))

    # MS type
    ms_type.append(
        pd.Series("LF" if "COADREAD" in dfile else "TMT", index=df.columns)
    )

    dmatrix.append(df)