def plot_best_runs_pr(best_pr, all_pr, run_info, filters, **kwargs):
    """
    Draw one merged figure of PR curves for protein identification: one curve
    per row of best_pr, using the full curve data found in all_pr.

    NOTE(review): a second function with this same name is defined later in
    this file. If both really live in the same module, the later definition
    replaces this one at import time -- confirm which one is intended.

    best_pr: a df with (at least) run_i, pep_i, pro_i, pro_id, prec, recall,
        seqstr and flustr per row. Every run_i in run_info.run_iz must be
        present, because get_group raises KeyError for a missing run.
    all_pr: a df with run_i, pep_i, prec, recall and score columns holding the
        complete curve for each (run_i, pep_i) pair.
    run_info: object with parallel run_iz and run_labels sequences.
    filters: object whose .classifier is shown in the title.
    kwargs: forwarded to plots._plot_pr_curve. An optional "_zplots_context"
        entry supplies the ZPlots instance to draw into.
    """
    # Pop (rather than get) the context: it is passed explicitly to
    # _plot_pr_curve below, and leaving it inside kwargs as well would hand the
    # same keyword over twice, which Python rejects with a TypeError.
    z = kwargs.pop("_zplots_context", None) or ZPlots()

    df = best_pr.sort_values(by=["prec", "recall"], ascending=[False, False])
    title = f"PR curves, protein identification ({len(df.pro_i.unique())} proteins), best runs ({filters.classifier})."

    with z(
        f_title=title,
        _merge=True,
        _legend="bottom_right",
        f_y_axis_label="precision",
        f_x_axis_label="read recall",
    ):
        # With several runs each run gets a single color; with only one run
        # every curve gets its own color instead.
        color_by_run = len(run_info.run_iz) > 1
        groups = df.groupby("run_i")

        for run_i, run_label in zip(run_info.run_iz, run_info.run_labels):
            group = groups.get_group(run_i)
            if color_by_run:
                color = z.next()

            for _, row in group.iterrows():
                if not color_by_run:
                    color = z.next()

                pep_i = row.pep_i
                pro_id = row.pro_id
                legend_label = f"{run_label} {pro_id}"
                line_label = f"{pro_id} pep{row.pep_i:03d} {row.seqstr} {row.flustr}"

                prdf = all_pr[(all_pr.run_i == run_i) & (all_pr.pep_i == pep_i)]
                prsa = (prdf.prec.values, prdf.recall.values, prdf.score.values, None)
                plots._plot_pr_curve(
                    prsa,
                    color=color,
                    legend_label=legend_label,
                    _label=line_label,
                    _zplots_context=z,
                    **kwargs,
                )
def plot_best_runs_pr(best_pr, all_pr, run_info, filters, **kwargs):
    """
    Draw one merged figure of PR curves for the best peptides: one curve per
    row of best_pr (after sorting by prec, then recall, both descending, and
    keeping the first filters.plot_n_peps rows). Curves are colored by run.

    best_pr: a df with (at least) run_i, pep_i, prec, recall, seqstr, flustr
        and flu_count per row. Every run_i that appears must also be listed in
        run_info.run_iz (it is used as a dict key).
    all_pr: a df with run_i, pep_i, prec, recall and score columns holding the
        complete curve for each (run_i, pep_i) pair.
    run_info: object with parallel run_iz and run_labels sequences.
    filters: object with .plot_n_peps (row limit) and .classifier (title only).
    kwargs: forwarded to plots._plot_pr_curve. An optional "_zplots_context"
        entry supplies the ZPlots instance to draw into.
    """
    # Pop the context so it can be passed on explicitly exactly once. Before,
    # a ZPlots created here (no context supplied by the caller) owned the
    # merged figure but was never handed to _plot_pr_curve.
    z = kwargs.pop("_zplots_context", None) or ZPlots()

    df = best_pr.sort_values(by=["prec", "recall"], ascending=[False, False])[
        : filters.plot_n_peps
    ]

    z.color_reset()
    title = f"PR curves, best {len(df.pep_i.unique())} peptides, best runs. {filters.classifier} "

    # One (label, color) pair per run, allocated up front so that the color of
    # a run does not depend on the order in which its peptides are drawn.
    run_i_to_info = {
        run_i: (run_label, z.next())
        for run_i, run_label in zip(run_info.run_iz, run_info.run_labels)
    }

    with z(
        f_title=title,
        _merge=True,
        _legend="bottom_right",
        f_y_axis_label="precision",
        f_x_axis_label="read recall",
    ):
        for _, row in df.iterrows():
            run_i = row.run_i
            pep_i = row.pep_i
            run_label, color = run_i_to_info[run_i]

            legend_label = f"{run_label} p{pep_i}"
            line_label = f"{row.pep_i:03d} {row.seqstr} {row.flustr} ({row.flu_count})"

            prdf = all_pr[(all_pr.run_i == run_i) & (all_pr.pep_i == pep_i)]
            prsa = (prdf.prec.values, prdf.recall.values, prdf.score.values, None)
            plots._plot_pr_curve(
                prsa,
                color=color,
                legend_label=legend_label,
                _label=line_label,
                _zplots_context=z,
                **kwargs,
            )
def plot_sigproc_stats(run):
    """
    Show a row of sigproc quality plots for a run: a histogram of the
    per-frame "quality" column followed by the raw images of the best, median
    and worst quality frames.

    run: a run result exposing run.sigproc_v1.fields() (a df with quality,
        field_i, channel_i and cycle_i columns) and
        run.sigproc_v1.raw_im(field_i, channel_i, cycle_i).

    Raises IndexError (from .iloc) when the fields table is empty.
    """
    # Hist quality, peaks per field, background, etc.
    # Maybe done as rows per field heatmap
    z = ZPlots()
    with z(_cols=4, f_plot_width=250, f_plot_height=280):
        # Fetch the table once and reuse it for both the histogram and the
        # ranking (sort_values returns a new frame, fields_df is not modified).
        fields_df = run.sigproc_v1.fields()
        z.hist(
            fields_df.quality,
            f_x_axis_label="quality",
            f_y_axis_label="n_frames",
            _bins=np.linspace(0, 400, 100),
        )

        by_quality = fields_df.sort_values(by="quality")

        def _frame_at(position):
            """Return (raw image, table row) for the frame at this rank."""
            row = by_quality.iloc[position]
            im = run.sigproc_v1.raw_im(row.field_i, row.channel_i, row.cycle_i)
            return im, row

        best_im, best = _frame_at(-1)
        worst_im, worst = _frame_at(0)
        # Index the median by the length of the table actually being indexed,
        # so the position is always in range and always the middle row.
        median_im, median = _frame_at(len(by_quality) // 2)

        # All three images share one color span (99th percentile of the median
        # frame) so that their brightness is directly comparable.
        cspan = (0, np.percentile(median_im.flatten(), q=99, axis=0))

        z.im(
            best_im,
            _cspan=cspan,
            f_title=f"Best field={best.field_i} channel={best.channel_i} cycle={best.cycle_i}",
        )
        z.im(
            median_im,
            _cspan=cspan,
            f_title=f"Median field={median.field_i} channel={median.channel_i} cycle={median.cycle_i}",
        )
        z.im(
            worst_im,
            _cspan=cspan,
            f_title=f"Worst field={worst.field_i} channel={worst.channel_i} cycle={worst.cycle_i}",
        )
def plot_psfs(psfs, scale=1.0, **kwargs):
    """
    Show a square grid of square PSF images as one tiled composite image.

    psfs: 4-d array indexed [grid_row, grid_col, y, x]; the grid must be
        square and each tile must be square (both checked with assert).
    scale: multiplier applied to the composite's pixel size to pick the
        default plot "_size" (never below 100).
    kwargs: passed on to z.im(). "_zplots_context" (optional) is removed and
        used as the ZPlots instance; "_size" (optional) overrides the default
        size and is still passed on to z.im().
    """
    n_grid_rows, n_grid_cols, tile_h, tile_w = psfs.shape
    assert n_grid_rows == n_grid_cols
    n_grid = n_grid_rows
    assert tile_h == tile_w
    tile = tile_h
    side = n_grid * tile

    z = kwargs.pop("_zplots_context", None) or ZPlots()

    default_size = max(100, int(tile * n_grid * scale))
    with z(_size=kwargs.get("_size", default_size)):
        composite = np.zeros((side, side))
        # Reorder axes to [grid_row, y, grid_col, x] so that flattening the
        # first and the last pair of axes lays the tiles out side by side.
        composite[:, :] = psfs.transpose(0, 2, 1, 3).reshape(side, side)
        z.im(composite, **kwargs)
def plot_pr_breakout_peps_runs(job, peps_runs_df, filters, **kwargs):
    """
    Single plot of PR curves for run_i+pep_i pairs given in peps_runs_df

    job: the JobResult that contains all of the RunResult objects indexed by run_i
    peps_runs_df: a df containing run_i,run_name,pep_i,ptm (and prec) per row
        whose PR should be plotted
    filters: object whose .classifier is shown in the default title
    kwargs: "_zplots_context" (optional ZPlots to draw into) and "f_title"
        (optional title override) are consumed here; everything else is
        passed to the ZPlots context.
    """
    # TODO: see similar in plots_dev_mhc.py when plotting from df
    # This is only being called by PTM template at the moment I think.
    # Move to dev_ptm module
    z = kwargs.pop("_zplots_context", None) or ZPlots()
    z.color_reset()

    n_peps = len(peps_runs_df.pep_i.unique())
    default_title = f"PR curves, {n_peps} peptides, best runs ({filters.classifier})"
    title = kwargs.pop("f_title", default_title)

    with z(
        _merge=True,
        f_y_axis_label="precision",
        f_x_axis_label="read recall",
        f_title=title,
        _legend="bottom_right",
        **kwargs,
    ):
        # Pass 1: gather one entry per (run_i, pep_i) so the curves can be
        # ordered before plotting, which also fixes the legend order.
        curves = []
        for (run_i, pep_i), rows in peps_runs_df.groupby(["run_i", "pep_i"]):
            run_name = rows.run_name.iloc[0]

            ptm_text = ";".join(rows.ptm.astype(str))
            if ptm_text:
                ptm_text = f"({ptm_text})"

            # The run name is shown without its last "_"-separated piece.
            run_stem = "_".join(run_name.split("_")[:-1])
            legend = f"r{run_i}p{pep_i}{ptm_text}{run_stem}"

            # String key: first PTM, then reverse precision (1 - prec).
            order_key = f"{rows.ptm.iloc[0]}prec{1-rows.prec.iloc[0]:.4f}"
            curves.append((order_key, run_i, pep_i, legend))

        curves.sort(key=lambda curve: curve[0])

        # Pass 2: plot in sorted order.
        for _, run_i, pep_i, legend in curves:
            plots.plot_pr_breakout(
                job.runs[run_i],
                pep_iz=[pep_i],
                color=z.next(),
                legend_label=legend,
                _noise=0.005,
                _zplots_context=z,
            )
def plot_best_runs_peptide_observability(job, best_pr, run_info, all_runs_pr, filters, **kwargs):
    """
    peptide observability-vs-precision considering the best runs for each
    peptide as if they are one big super-run -- how well can we see peptides
    vs precision if the best run (as defined by filters) is chosen for each
    peptide?

    The first filters.plot_n_runs runs of run_info are drawn individually,
    followed by one "combined" curve built from the rows of all_runs_pr that
    belong to each run's own best peptides.

    job: the JobResult whose .runs is indexed by run_i
    best_pr: not used by this function (kept for a uniform call signature)
    run_info: object with parallel run_iz, pep_counts and peps sequences
    all_runs_pr: a df with (at least) run_i and pep_i columns
    filters: supports .get("classifier", ""), .plot_n_runs and .peptide_subset
    kwargs: forwarded to the "combined" plot call. An optional
        "_zplots_context" entry supplies the ZPlots instance to draw into.

    Raises ValueError (from pd.concat) when run_info lists no runs.
    """
    # Pop the context so it can be handed to every plot call explicitly and
    # exactly once. Before, a ZPlots created here was passed to the per-run
    # calls but never to the final "combined" call.
    z = kwargs.pop("_zplots_context", None) or ZPlots()
    z.color_reset()

    classifier_name = filters.get("classifier", "").upper()

    with z(
        _merge=True,
        f_title=f"Peptide-Classes Precision/Recall (best {filters.plot_n_runs} + combined-best runs) ({classifier_name})",
        f_y_axis_label="precision",
        f_x_axis_label="peptide-classes recall",
    ):
        best_runs_full_pr = []
        for i, (run_i, n_pep, peps) in enumerate(
            zip(run_info.run_iz, run_info.pep_counts, run_info.peps)
        ):
            # Every run contributes its best peptides to the combined curve...
            best_runs_full_pr.append(
                all_runs_pr[(all_runs_pr.run_i == run_i) & (all_runs_pr.pep_i.isin(peps))]
            )

            # ...but only the first plot_n_runs runs get a curve of their own.
            if i < filters.plot_n_runs:
                run = job.runs[run_i]
                label = f"{plots_dev._run_labels(run.run_name)} ({n_pep})"
                plots.plot_peptide_observability_vs_precision(
                    run,
                    pep_iz=filters.peptide_subset,
                    color=z.next(),
                    pr_axes=True,
                    _label=label,
                    legend_label=label,
                    _legend="top_right",
                    _range=(0, 1.05, 0, 1.05),
                    _zplots_context=z,
                )

        best_full_pr = pd.concat(best_runs_full_pr)
        plots._plot_peptide_observability_vs_precision(
            best_full_pr,
            color=z.next(),
            _label="combined",
            legend_label="combined (filters)",
            _zplots_context=z,
            **kwargs,
        )
def plot_pr_scatter_peps_runs(peps_runs_df, run_info, **kwargs):
    """
    Single scatter plot of best PR for run_i+pep_i pairs given in peps_runs_df,
    one colored series per run.

    peps_runs_df: a df containing run_i,pep_i,prec,recall plus seqstr, flustr
        and flu_count (used for the hover label). It is copied, not modified.
    run_info: a Munch containing run_iz,run_labels,pep_counts,and peps per run, sorted
    kwargs: "_zplots_context" (optional ZPlots to draw into) and "f_title"
        (optional title override).
    """

    def _point_label(rec):
        return f"{rec.pep_i:03d} {rec.seqstr} {rec.flustr} ({rec.flu_count})"

    points = peps_runs_df.copy()
    points["label"] = points.apply(_point_label, axis=1)

    z = kwargs.pop("_zplots_context", None) or ZPlots()

    n_peps = len(peps_runs_df.pep_i.unique())
    default_title = f"{n_peps} peptides, best precision for recall-filter"
    title = kwargs.get("f_title", default_title)

    z.color_reset()

    with z(
        _merge=True,
        f_y_axis_label="precision",
        f_x_axis_label="read recall",
        f_title=title,
        _legend="bottom_right",
        _range=(0, 1.05, 0, 1.05),
    ):
        points_by_run = points.groupby("run_i")
        run_columns = zip(run_info.run_iz, run_info.run_labels, run_info.pep_counts)
        for run_i, run_label, pep_count in run_columns:
            try:
                run_points = points_by_run.get_group(run_i)
            except KeyError:
                # A run without any rows in the df is fine; it is skipped and
                # does not consume a color.
                continue

            z.scat(
                source=run_points,
                y="prec",
                x="recall",
                _label="label",
                fill_alpha=0.8,
                color=z.next(),
                legend_label=f"{run_label} ({pep_count})",
            )
def plot_best_runs_peptide_yield(best_pr, run_info, filters, **kwargs):
    """
    Column plot showing, for each run, the fraction of all peptides for which
    that run was the 'best run' based on filter criteria.

    best_pr: a df with a pep_i column; its unique count is the denominator
    run_info: object with run_labels (x categories) and pep_counts, which must
        support element-wise division by a number
    filters: supports .get("classifier", ""); only used in the title
    kwargs: "_zplots_context" (optional ZPlots to draw into)
    """
    n_unique_peps = len(best_pr.pep_i.unique())
    yield_by_run = run_info.pep_counts / n_unique_peps

    classifier_name = filters.get("classifier", "").upper()
    run_labels = run_info.run_labels

    plot_options = dict(
        x=run_labels,
        f_x_range=run_labels,
        _x_axis_label_orientation=1.2,
        f_title=f'"Best PR" peptide-yield for runs that produced a best peptide-pr ({classifier_name})',
        f_y_axis_label="fraction of total peptides",
        f_x_axis_label="run name",
        _label=run_info.pep_counts,
        _size_x=1000,
    )

    z = kwargs.get("_zplots_context", None) or ZPlots()
    z.cols(yield_by_run, **plot_options)
def plot_confusion_matrix_compare(
    run,
    pep_i,
    score_threshold,
    classifier=None,
):
    """
    Plot two confusion matrices side by side - the first with all calls, the
    second with calls culled according to score_threshold. Precision and
    recall for pep_i are shown in the title of each plot.

    Precision is the diagonal entry for pep_i divided by the sum of its row,
    recall is the same entry divided by the sum of its column.

    classifier: None to use any available preferred classifier, or one of the
        supported classifiers in RunResult::get_available_classifiers(), e.g. 'rf', 'nn'
    """

    def _prec_recall(cm, p_i):
        hits = cm[p_i, p_i]
        return (
            np_safe_divide(hits, np.sum(cm[p_i, :])),
            np_safe_divide(hits, np.sum(cm[:, p_i])),
        )

    cb = run.test_call_bag(classifier=classifier)

    z = ZPlots()
    with z(_cols=2, f_x_axis_label="true pep_i"):
        # Left: every call.
        all_calls_cm = cb.conf_mat()
        prec, recall = _prec_recall(all_calls_cm, pep_i)
        z.im(
            np.array(all_calls_cm),
            _size=500,
            f_title=f"{run.run_name}: pep_i={pep_i} precision={prec:.2f} recall={recall:.2f} {cb.classifier_name}",
            f_y_axis_label="pep_i predicted by classifier",
        )

        # Right: only the calls kept at score_threshold.
        culled_cm = cb.conf_mat_at_score_threshold(score_threshold)
        prec, recall = _prec_recall(culled_cm, pep_i)
        z.im(
            np.array(culled_cm),
            _size=500,
            f_title=f"precision={prec:.2f} recall={recall:.2f} score={score_threshold:.2f} {cb.classifier_name}",
        )
def _raw_peak_i_zoom(
    field_i,
    res,
    df,
    peak_i,
    channel=0,
    zoom=3.0,
    square_radius=7,
    x_pad=0,
    cspan=(0, 5_000),
    separate=False,
    show_circles=True,
):
    """
    Show a zoomed crop of the raw image around one peak for every cycle,
    either as one horizontal strip (separate=False) or as one small image per
    cycle (separate=True). A square marker is added around the peak.

    field_i: ignored; the field is always looked up from the peak's records in df
    res: result object providing raw_chcy_ims(field_i) and n_cycles
    df: a df with peak_i, field_i, cycle_i, raw_x and raw_y columns, with one
        row per cycle for the peak (a missing cycle raises IndexError)
    peak_i: the peak to show (converted with int())
    channel: channel index into the raw images
    zoom: plot pixels per image pixel
    square_radius: radius of the marker; the crop is one pixel larger per side
    x_pad: blank columns between consecutive cycles in the strip
    cspan: color span for the images; its upper value is also the marker intensity
    separate: True for one plot per cycle, False for a single strip
    show_circles: NOTE(review): when False no cycle is processed at all, so the
        strip stays blank and no per-cycle plots are made -- confirm whether
        the flag was meant to suppress only the marker.
    """
    peak_i = int(peak_i)
    peak_records = df[df.peak_i == peak_i]
    field_i = int(peak_records.iloc[0].field_i)

    im = res.raw_chcy_ims(field_i)

    square = cspan[1] * imops.generate_square_mask(square_radius)

    height = (square_radius + 1) * 2 + 1
    one_width = height + x_pad
    all_width = one_width * res.n_cycles - x_pad
    im_all_cycles = np.zeros((height, all_width))

    plot_height = int(height * zoom)
    plot_width = int(all_width * zoom)

    z = ZPlots()
    if show_circles:
        for cycle_i in range(res.n_cycles):
            # Work on a copy so the marker never ends up in the source image.
            im_with_marker = np.copy(im[channel, cycle_i])
            cy_rec = peak_records[peak_records.cycle_i == cycle_i].iloc[0]
            loc = XY(cy_rec.raw_x, cy_rec.raw_y)

            imops.accum_inplace(im_with_marker, square, loc=loc, center=True)
            im_with_marker = imops.extract_with_mask(
                im_with_marker,
                imops.generate_square_mask(square_radius + 1, True),
                loc=loc,
                center=True,
            )
            imops.accum_inplace(
                im_all_cycles, im_with_marker, loc=XY(cycle_i * one_width, 0)
            )

            if separate:
                # Per-cycle crops are square, hence the height used for both sides.
                z.im(
                    im_with_marker,
                    _noaxes=True,
                    f_plot_height=plot_height,
                    f_plot_width=plot_height,
                    _notools=True,
                    f_match_aspect=True,
                    _cspan=cspan,
                )

    if not separate:
        z.im(
            im_all_cycles,
            _noaxes=True,
            f_plot_height=plot_height,
            f_plot_width=plot_width,
            _notools=True,
            f_match_aspect=True,
            _cspan=cspan,
        )
def show_raw(
    peak_i,
    field_i,
    channel_i,
    cycle_i,
    min_bright=min_bright,
    max_bright=max_bright,
    show_circles=show_circles,
):
    """
    Show one raw (or aligned) image with peak markers, in one of two modes:

    - field_i == "": peak mode. peak_i selects a peak; its per-cycle signal is
      printed and plotted, a zoomed strip of the peak over all cycles is shown,
      and the peak is marked in the full image of its own field.
    - otherwise: field mode. peak_i is ignored and, if show_circles is set,
      every peak of that field/cycle is circled.

    Reads df, res, square_radius, peak_i_square and result_block from the
    enclosing scope (not visible from here).

    peak_i, field_i, channel_i, cycle_i: values convertible with int()
        (field_i may be the empty string, see above)
    min_bright, max_bright: color span of the image; max_bright is also the
        intensity used for the markers
    show_circles: draw peak markers
    """
    field_i = int(field_i) if field_i != "" else None
    channel_i = int(channel_i)
    cycle_i = int(cycle_i)

    if field_i is None:
        peak_i = int(peak_i)
        peak_records = df[df.peak_i == peak_i]
        field_i = int(peak_records.iloc[0].field_i)
    else:
        peak_i = None

    # NOTE: drawing the mask rects of the field (res.raw_mask_rects_df()) was
    # temporarily removed here; bringing it back needs a per-frame groupby
    # over the rects table.

    cspan = (min_bright, max_bright)
    circle = cspan[1] * imops.generate_donut_mask(4, 3)
    square = cspan[1] * imops.generate_square_mask(square_radius)

    z = ZPlots()

    if peak_i is not None:
        # The signal is only needed in peak mode, so load and summarise it
        # only here instead of on every call.
        sig_for_channel = res.sig()[:, channel_i, :]
        sig_top = np.median(sig_for_channel) + np.percentile(sig_for_channel, 99.9)

        rad = sig_for_channel[peak_i]
        rad = rad.reshape(1, rad.shape[0])
        print("\n".join(f" cycle {cycle:2d}: {r:6.0f}" for cycle, r in enumerate(rad[0])))
        z.scat(x=range(len(rad[0])), y=rad[0])
        z.im(rad, _cspan=(0, sig_top), f_plot_height=50, _notools=True)

        # This is inefficient because the function we will call
        # does the same image load, but I'd prefer to not repeat
        # the code here and want to be able to call this fn
        # from notebooks:
        _raw_peak_i_zoom(
            field_i,
            res,
            df,
            peak_i,
            channel_i,
            zoom=3.0,
            square_radius=square_radius,
            x_pad=1,
            cspan=cspan,
            separate=False,
            show_circles=show_circles,
        )

    if result_block == "sigproc_v1":
        im = res.raw_chcy_ims(field_i).copy()[channel_i, cycle_i]
    else:
        im = res.aln_ims[field_i, channel_i, cycle_i].copy()

    if peak_i is not None:
        cy_rec = peak_records[peak_records.cycle_i == cycle_i].iloc[0]
        im_marker = square if peak_i_square else circle
        imops.accum_inplace(
            im,
            im_marker,
            loc=XY(cy_rec.raw_x, cy_rec.raw_y),
            center=True,
        )
    elif show_circles:
        peak_records = df[(df.field_i == field_i) & (df.cycle_i == cycle_i)]

        # In the case of a field with no peaks, n_peaks may be NaN, so check that we have
        # some peaks before passing NaNs to imops.
        if peak_records.n_peaks.iloc[0] > 0:
            for _, peak in peak_records.iterrows():
                imops.accum_inplace(
                    im,
                    circle,
                    loc=XY(peak.raw_x, peak.raw_y),
                    center=True,
                )

    z.im(
        im,
        f_title=f"ch_i={channel_i} cy_i={cycle_i} fl_i={field_i}",
        _full=True,
        _noaxes=True,
        _cspan=(float(min_bright), float(max_bright)),
    )
    displays.fix_auto_scroll()