예제 #1
0
def plot_best_runs_pr(best_pr, all_pr, run_info, filters, **kwargs):
    """
    Plot one PR curve per row of best_pr, all merged into a single figure.

    best_pr: a df with one row per curve to plot; the columns read here are
        run_i, pep_i, pro_i, pro_id, seqstr, flustr, prec and recall
    all_pr: a df of PR points with columns run_i, pep_i, prec, recall, score
    run_info: an object with parallel sequences run_iz and run_labels
    filters: an object whose classifier attribute is shown in the title
    kwargs: forwarded to plots._plot_pr_curve. "_zplots_context" may supply
        an existing ZPlots to draw into; it is consumed here.

    When there is more than one run, every curve of a run shares one color;
    with a single run each curve gets its own color.
    """
    df = best_pr.sort_values(by=["prec", "recall"], ascending=[False, False])

    # pop (not get): the context is passed explicitly to _plot_pr_curve below,
    # so leaving it in kwargs would pass _zplots_context twice and raise
    # "TypeError: got multiple values for keyword argument".
    z = kwargs.pop("_zplots_context", None) or ZPlots()

    title = f"PR curves, protein identification ({len(df.pro_i.unique())} proteins), best runs ({filters.classifier})."
    with z(
        f_title=title,
        _merge=True,
        _legend="bottom_right",
        f_y_axis_label="precision",
        f_x_axis_label="read recall",
    ):
        color_by_run = len(run_info.run_iz) > 1
        groups = df.groupby("run_i")
        for run_i, run_label in zip(run_info.run_iz, run_info.run_labels):
            group = groups.get_group(run_i)
            if color_by_run:
                color = z.next()
            for _, row in group.iterrows():
                if not color_by_run:
                    color = z.next()
                pep_i = row.pep_i
                pro_id = row.pro_id
                legend_label = f"{run_label} {pro_id}"
                line_label = f"{pro_id} pep{pep_i:03d} {row.seqstr} {row.flustr}"
                prdf = all_pr[(all_pr.run_i == run_i) & (all_pr.pep_i == pep_i)]
                prsa = (prdf.prec.values, prdf.recall.values, prdf.score.values, None)
                plots._plot_pr_curve(
                    prsa,
                    color=color,
                    legend_label=legend_label,
                    _label=line_label,
                    _zplots_context=z,
                    **kwargs,
                )
예제 #2
0
def plot_best_runs_pr(best_pr, all_pr, run_info, filters, **kwargs):
    """
    Plot PR curves for the top filters.plot_n_peps rows of best_pr in one
    merged figure, using one color per run.

    best_pr: a df with run_i, pep_i, seqstr, flustr, flu_count, prec, recall
    all_pr: a df of PR points with columns run_i, pep_i, prec, recall, score
    run_info: an object with parallel sequences run_iz and run_labels
    filters: an object providing plot_n_peps and classifier
    kwargs: forwarded unchanged to plots._plot_pr_curve; "_zplots_context"
        may supply an existing ZPlots to draw into
    """
    ranked = best_pr.sort_values(by=["prec", "recall"],
                                 ascending=[False, False])
    top = ranked[:filters.plot_n_peps]

    ctx = kwargs.get("_zplots_context", None) or ZPlots()
    ctx.color_reset()
    heading = f"PR curves, best {len(top.pep_i.unique())} peptides, best runs. {filters.classifier} "

    # Colors are handed out in run_info order, before any curve is drawn, so
    # that every curve of a run shares that run's color.
    label_of_run = {}
    color_of_run = {}
    for run_i, run_label in zip(run_info.run_iz, run_info.run_labels):
        label_of_run[run_i] = run_label
        color_of_run[run_i] = ctx.next()

    with ctx(
            f_title=heading,
            _merge=True,
            _legend="bottom_right",
            f_y_axis_label="precision",
            f_x_axis_label="read recall",
    ):
        for _, rec in top.iterrows():
            run_i, pep_i = rec.run_i, rec.pep_i
            legend_text = f"{label_of_run[run_i]} p{pep_i}"
            line_text = f"{rec.pep_i:03d} {rec.seqstr} {rec.flustr} ({rec.flu_count})"
            curve_color = color_of_run[run_i]

            # All PR points belonging to this (run, peptide) pair.
            is_this_curve = (all_pr.run_i == run_i) & (all_pr.pep_i == pep_i)
            curve = all_pr[is_this_curve]
            plots._plot_pr_curve(
                (curve.prec.values, curve.recall.values, curve.score.values,
                 None),
                color=curve_color,
                legend_label=legend_text,
                _label=line_text,
                **kwargs,
            )
예제 #3
0
def plot_sigproc_stats(run):
    """
    Show a histogram of the per-row "quality" values from
    run.sigproc_v1.fields(), followed by the raw images of the highest,
    middle and lowest quality rows.

    The three images share one color span: 0 up to the 99th percentile of
    the middle image.
    """
    # Hist quality, peaks per field, background, etc.
    # Maybe done as rows per field heatmap
    z = ZPlots()
    with z(_cols=4, f_plot_width=250, f_plot_height=280):
        quality = run.sigproc_v1.fields().quality
        z.hist(
            quality,
            f_x_axis_label="quality",
            f_y_axis_label="n_frames",
            _bins=np.linspace(0, 400, 100),
        )

        ranked = run.sigproc_v1.fields().sort_values(by="quality")

        def frame_at(position):
            """Return (raw image, row) for the row at `position` in ranked."""
            rec = ranked.iloc[position]
            raw = run.sigproc_v1.raw_im(rec.field_i, rec.channel_i,
                                        rec.cycle_i)
            return raw, rec

        best_im, best = frame_at(-1)
        worst_im, worst = frame_at(0)
        median_im, median = frame_at(run.sigproc_v1.n_frames // 2)

        cspan = (0, np.percentile(median_im.flatten(), q=99, axis=0))

        panels = (
            (best_im,
             f"Best field={best.field_i} channel={best.channel_i} cycle={best.cycle_i}"),
            (median_im,
             f"Median field={median.field_i} channel={median.channel_i} cycle={median.cycle_i}"),
            (worst_im,
             f"Worst field={worst.field_i} channel={worst.channel_i} cycle={worst.cycle_i}"),
        )
        for frame_im, caption in panels:
            z.im(frame_im, _cspan=cspan, f_title=caption)
예제 #4
0
def plot_psfs(psfs, scale=1.0, **kwargs):
    """
    Tile a square grid of square images into one composite and show it.

    psfs: array whose shape unpacks to (divs, divs, dim, dim); both the grid
        and each tile must be square
    scale: multiplier on the composite's pixel size used for the default
        plot size (never below 100)
    kwargs: "_zplots_context" optionally supplies the ZPlots to draw into;
        "_size" overrides the computed plot size; everything except
        "_zplots_context" is also passed on to z.im
    """
    n_rows, n_cols, tile_h, tile_w = psfs.shape
    assert n_rows == n_cols
    assert tile_h == tile_w
    divs, dim = n_rows, tile_h

    z = kwargs.pop("_zplots_context", None) or ZPlots()
    default_size = max(100, int(dim * divs * scale))
    with z(_size=kwargs.get("_size", default_size)):
        side = divs * dim
        mosaic = np.zeros((side, side))
        for row_i in range(divs):
            rows = slice(row_i * dim, (row_i + 1) * dim)
            for col_i in range(divs):
                cols = slice(col_i * dim, (col_i + 1) * dim)
                mosaic[rows, cols] = psfs[row_i, col_i]
        z.im(mosaic, **kwargs)
예제 #5
0
def plot_pr_breakout_peps_runs(job, peps_runs_df, filters, **kwargs):
    """
    Draw, in one merged figure, a PR breakout curve for every distinct
    (run_i, pep_i) pair found in peps_runs_df.

    job: the JobResult; job.runs[run_i] must give the run for each run_i
    peps_runs_df: a df with run_i, run_name, pep_i, ptm and prec columns; a
        pair may span several rows (one per ptm)
    filters: an object whose classifier attribute is shown in the default
        title
    kwargs: "_zplots_context" optionally supplies the ZPlots to draw into,
        "f_title" overrides the title; anything left is passed to the
        figure context
    """
    # TODO: see similar in plots_dev_mhc.py when plotting from df
    # This is only being called by PTM template at the moment I think.
    # Move to dev_ptm module

    z = kwargs.pop("_zplots_context", None) or ZPlots()
    z.color_reset()
    default_title = f"PR curves, {len(peps_runs_df.pep_i.unique())} peptides, best runs ({filters.classifier})"
    title = kwargs.pop("f_title", default_title)
    with z(
            _merge=True,
            f_y_axis_label="precision",
            f_x_axis_label="read recall",
            f_title=title,
            _legend="bottom_right",
            **kwargs,
    ):
        # Gather every curve first: drawing order decides legend order, and
        # that order comes from the sort below rather than from the groupby.
        curves = []
        for (run_i, pep_i), rows in peps_runs_df.groupby(["run_i", "pep_i"]):
            ptm_text = ";".join(rows.ptm.astype(str))
            if ptm_text:
                ptm_text = f"({ptm_text})"
            run_prefix = "_".join(rows.run_name.iloc[0].split("_")[:-1])
            legend = f"r{run_i}p{pep_i}{ptm_text}{run_prefix}"
            # String key: first PTM, then (1 - precision) so that within a
            # PTM higher precision sorts first.
            order_key = f"{rows.ptm.iloc[0]}prec{1-rows.prec.iloc[0]:.4f}"
            curves.append((order_key, run_i, pep_i, legend))

        curves.sort(key=lambda curve: curve[0])

        for _, run_i, pep_i, legend in curves:
            plots.plot_pr_breakout(
                job.runs[run_i],
                pep_iz=[pep_i],
                color=z.next(),
                legend_label=legend,
                _noise=0.005,
                _zplots_context=z,
            )
예제 #6
0
def plot_best_runs_peptide_observability(job, best_pr, run_info, all_runs_pr,
                                         filters, **kwargs):
    """
    Peptide observability-vs-precision for the best runs, plus one extra
    "combined" curve that treats each run's best peptides as if they all
    came from a single super-run.

    job: provides job.runs[run_i] for every run_i in run_info.run_iz
    best_pr: not read by this function
    run_info: an object with parallel sequences run_iz, pep_counts and peps
        (the peptides for which each run is the best run)
    all_runs_pr: a df of PR rows with at least run_i and pep_i columns
    filters: provides plot_n_runs, peptide_subset and an optional
        "classifier" entry (read with .get)
    kwargs: "_zplots_context" optionally supplies the ZPlots to draw into;
        all kwargs are forwarded to the combined-curve plot
    """
    z = kwargs.get("_zplots_context", None) or ZPlots()
    z.color_reset()
    classifier_name = filters.get("classifier", "").upper()
    title = f"Peptide-Classes Precision/Recall (best {filters.plot_n_runs} + combined-best runs) ({classifier_name})"
    with z(
            _merge=True,
            f_title=title,
            f_y_axis_label="precision",
            f_x_axis_label="peptide-classes recall",
    ):
        per_run_pr = []
        run_rows = zip(run_info.run_iz, run_info.pep_counts, run_info.peps)
        for rank, (run_i, n_pep, peps) in enumerate(run_rows):
            # Every run feeds the combined curve, restricted to the peptides
            # listed for that run.
            from_this_run = all_runs_pr.run_i == run_i
            wanted_pep = all_runs_pr.pep_i.isin(peps)
            per_run_pr.append(all_runs_pr[from_this_run & wanted_pep])

            run = job.runs[run_i]
            # Only the first plot_n_runs runs get a curve of their own.
            if rank < filters.plot_n_runs:
                label = f"{plots_dev._run_labels(run.run_name)} ({n_pep})"
                plots.plot_peptide_observability_vs_precision(
                    run,
                    pep_iz=filters.peptide_subset,
                    color=z.next(),
                    pr_axes=True,
                    _label=label,
                    legend_label=label,
                    _legend="top_right",
                    _range=(0, 1.05, 0, 1.05),
                    _zplots_context=z,
                )

        combined_pr = pd.concat(per_run_pr)
        plots._plot_peptide_observability_vs_precision(
            combined_pr,
            color=z.next(),
            _label="combined",
            legend_label="combined (filters)",
            **kwargs,
        )
예제 #7
0
def plot_pr_scatter_peps_runs(peps_runs_df, run_info, **kwargs):
    """
    Scatter precision against recall for every row of peps_runs_df in one
    merged figure, one color and one legend entry per run.

    peps_runs_df: a df with run_i, pep_i, seqstr, flustr, flu_count, prec
        and recall columns; it is copied, not modified
    run_info: an object with parallel sequences run_iz, run_labels and
        pep_counts; runs are drawn in this order
    kwargs: "_zplots_context" optionally supplies the ZPlots to draw into,
        "f_title" overrides the default title
    """

    def point_label(rec):
        return f"{rec.pep_i:03d} {rec.seqstr} {rec.flustr} ({rec.flu_count})"

    labeled = peps_runs_df.copy()
    labeled["label"] = labeled.apply(point_label, axis=1)

    z = kwargs.pop("_zplots_context", None) or ZPlots()
    default_title = f"{len(peps_runs_df.pep_i.unique())} peptides, best precision for recall-filter"
    title = kwargs.get("f_title", default_title)
    z.color_reset()
    with z(
            _merge=True,
            f_y_axis_label="precision",
            f_x_axis_label="read recall",
            f_title=title,
            _legend="bottom_right",
            _range=(0, 1.05, 0, 1.05),
    ):
        by_run = labeled.groupby("run_i")
        run_rows = zip(run_info.run_iz, run_info.run_labels,
                       run_info.pep_counts)
        for run_i, run_label, pep_count in run_rows:
            try:
                points = by_run.get_group(run_i)
            except KeyError:
                # A run with no rows in the frame is fine; it is just not
                # drawn and takes no color.
                continue
            legend_text = f"{run_label} ({pep_count})"
            z.scat(
                source=points,
                y="prec",
                x="recall",
                _label="label",
                fill_alpha=0.8,
                color=z.next(),
                legend_label=legend_text,
            )
예제 #8
0
def plot_best_runs_peptide_yield(best_pr, run_info, filters, **kwargs):
    """
    Column chart of, per run, the fraction of all peptides for which that
    run was the "best run" under the filter criteria.

    best_pr: a df with a pep_i column; its unique count is the denominator
    run_info: an object with pep_counts (must support division by an int,
        e.g. a numpy array) and run_labels, in matching order
    filters: a mapping-like object; its optional "classifier" entry is
        upper-cased into the title
    kwargs: "_zplots_context" optionally supplies the ZPlots to draw into
    """
    n_unique_peps = len(best_pr.pep_i.unique())
    yield_by_run = run_info.pep_counts / n_unique_peps
    run_names = run_info.run_labels

    classifier = filters.get("classifier", "").upper()
    heading = f'"Best PR" peptide-yield for runs that produced a best peptide-pr ({classifier})'

    z = kwargs.get("_zplots_context", None) or ZPlots()
    z.cols(
        yield_by_run,
        x=run_names,
        f_x_range=run_names,
        _x_axis_label_orientation=1.2,
        f_title=heading,
        f_y_axis_label="fraction of total peptides",
        f_x_axis_label="run name",
        _label=run_info.pep_counts,
        _size_x=1000,
    )
예제 #9
0
def plot_confusion_matrix_compare(
    run,
    pep_i,
    score_threshold,
    classifier=None,
):
    """
    Show two confusion matrices side by side: the first built from every
    call, the second from only the calls kept at score_threshold. Each
    title reports the precision and recall of pep_i for that matrix.

    run: provides test_call_bag(classifier=...) and run_name
    pep_i: index of the peptide whose precision/recall goes in the titles
    score_threshold: passed to the call bag's conf_mat_at_score_threshold
    classifier: None to use any available preferred classifier, or one of
                the supported classifiers in
                RunResult::get_available_classifiers(), e.g. 'rf', 'nn'
    """

    def prec_and_recall(mat):
        # Precision divides the diagonal entry by its row sum, recall by
        # its column sum.
        prec = np_safe_divide(mat[pep_i, pep_i], np.sum(mat[pep_i, :]))
        recall = np_safe_divide(mat[pep_i, pep_i], np.sum(mat[:, pep_i]))
        return prec, recall

    call_bag = run.test_call_bag(classifier=classifier)
    z = ZPlots()
    with z(_cols=2, f_x_axis_label="true pep_i"):
        # Left: every call.
        all_calls = call_bag.conf_mat()
        prec, recall = prec_and_recall(all_calls)
        left_title = f"{run.run_name}: pep_i={pep_i} precision={prec:.2f} recall={recall:.2f} {call_bag.classifier_name}"
        z.im(
            np.array(all_calls),
            _size=500,
            f_title=left_title,
            f_y_axis_label="pep_i   predicted by classifier",
        )

        # Right: only the calls kept at the score threshold.
        kept_calls = call_bag.conf_mat_at_score_threshold(score_threshold)
        prec, recall = prec_and_recall(kept_calls)
        right_title = f"precision={prec:.2f} recall={recall:.2f} score={score_threshold:.2f} {call_bag.classifier_name}"
        z.im(
            np.array(kept_calls),
            _size=500,
            f_title=right_title,
        )
예제 #10
0
def _raw_peak_i_zoom(
        field_i,
        res,
        df,
        peak_i,
        channel=0,
        zoom=3.0,
        square_radius=7,
        x_pad=0,
        cspan=(0, 5_000),
        separate=False,
        show_circles=True,
):
    """
    Show a zoomed crop of the raw image around one peak, for every cycle.

    field_i: ignored; the field is always taken from the peak's own rows in
        df. Kept so that existing positional callers keep working.
    res: result object providing raw_chcy_ims(field_i) and n_cycles
    df: peak dataframe with peak_i, field_i, cycle_i, raw_x, raw_y columns
    peak_i: the peak to show (coerced to int)
    channel: channel index into the raw image stack
    zoom: plot pixels per image pixel
    square_radius: radius of the square marker; the crop is one pixel
        larger than the marker on every side
    x_pad: blank columns left between consecutive cycles in the strip
    cspan: color span passed to the plots; cspan[1] is also the marker
        intensity
    separate: if True, plot one image per cycle; otherwise plot a single
        strip holding all cycles side by side
    show_circles: if True, overlay the square marker on the peak location

    Raises IndexError if df has no row for peak_i, or no row for one of
    the cycles.
    """
    peak_i = int(peak_i)
    peak_records = df[df.peak_i == peak_i]
    # The peak's own records are authoritative for which field to load.
    field_i = int(peak_records.iloc[0].field_i)

    im = res.raw_chcy_ims(field_i)

    square = cspan[1] * imops.generate_square_mask(square_radius)

    height = (square_radius + 1) * 2 + 1
    one_width = height + x_pad
    all_width = one_width * res.n_cycles - x_pad
    im_all_cycles = np.zeros((height, all_width))

    f_plot_height = height * zoom
    f_plot_width = all_width * zoom

    z = ZPlots()
    # The crops are built for every cycle regardless of show_circles; only
    # the marker overlay is optional. (Previously the whole loop was skipped
    # when show_circles was False, leaving an all-zero strip.)
    for cycle_i in range(res.n_cycles):
        cycle_im = np.copy(im[channel, cycle_i])
        cy_rec = peak_records[peak_records.cycle_i == cycle_i].iloc[0]
        loc = XY(cy_rec.raw_x, cy_rec.raw_y)

        if show_circles:
            imops.accum_inplace(cycle_im, square, loc=loc, center=True)

        cycle_im = imops.extract_with_mask(
            cycle_im,
            imops.generate_square_mask(square_radius + 1, True),
            loc=loc,
            center=True,
        )
        imops.accum_inplace(im_all_cycles,
                            cycle_im,
                            loc=XY(cycle_i * one_width, 0))

        if separate:
            # Each per-cycle crop is square, so the plot width equals the
            # plot height.
            z.im(
                cycle_im,
                _noaxes=True,
                f_plot_height=int(f_plot_height),
                f_plot_width=int(f_plot_height),
                _notools=True,
                f_match_aspect=True,
                _cspan=cspan,
            )

    if not separate:
        z.im(
            im_all_cycles,
            _noaxes=True,
            f_plot_height=int(f_plot_height),
            f_plot_width=int(f_plot_width),
            _notools=True,
            f_match_aspect=True,
            _cspan=cspan,
        )
예제 #11
0
    def show_raw(
        peak_i,
        field_i,
        channel_i,
        cycle_i,
        min_bright=min_bright,
        max_bright=max_bright,
        show_circles=show_circles,
    ):
        """
        Display one frame for a given channel and cycle, with peak markers.

        Two modes, selected by field_i:
          * field_i == "": "peak mode". The field is looked up from the rows
            of df for peak_i; the peak's per-cycle signal is printed and
            plotted, a zoom strip is drawn via _raw_peak_i_zoom, and the
            frame is shown with a single marker on that peak.
          * otherwise: "field mode". peak_i is ignored and, if show_circles
            is set, every peak of that field and cycle gets a circle.

        The min_bright, max_bright and show_circles defaults are captured
        from the enclosing scope when this function is defined. The names
        df, res, square_radius, result_block and peak_i_square are read
        from the enclosing scope as well.
        """
        # Widget-style inputs: an empty string for field_i means "no field
        # chosen", which selects peak mode below.
        field_i = int(field_i) if field_i != "" else None
        channel_i = int(channel_i)
        cycle_i = int(cycle_i)
        if field_i is None:
            # Peak mode: derive the field from the first row of this peak.
            peak_i = int(peak_i)
            peak_records = df[df.peak_i == peak_i]
            field_i = int(peak_records.iloc[0].field_i)
        else:
            # Field mode: no single peak is highlighted.
            peak_i = None

        all_sig = res.sig()

        # mask_rects_for_field = res.raw_mask_rects_df()[field_i]
        # Temporarily removed. This is going to involve some groupby Super-Pandas-Kungfu(tm)
        # Here is my too-tired start...
        """
        import pandas as pd
        df = pd.DataFrame([
            (0, 0, 0, 100, 110, 120, 130),
            (0, 0, 1, 101, 111, 121, 131),
            (0, 0, 2, 102, 112, 122, 132),
            (0, 1, 0, 200, 210, 220, 230),
            (0, 1, 1, 201, 211, 221, 231),
            (0, 1, 2, 202, 212, 222, 232),
            (1, 0, 0, 1100, 1110, 1120, 1130),
            (1, 0, 1, 1101, 1111, 1121, 1131),
            (1, 0, 2, 1102, 1112, 1122, 1132),
            (1, 1, 0, 1200, 1210, 1220, 1230),
            (1, 1, 1, 1201, 1211, 1221, 1231),
        ], columns=["frame_i", "ch_i", "cy_i", "x", "y", "w", "h"])

        def rec(row):
            return row[["x", "y"]]

        df.set_index("frame_i").groupby(["frame_i"]).apply(rec)
        """
        # Placeholder until the mask-rect lookup above is restored; it is
        # not read anywhere below.
        mask_rects_for_field = None

        # Markers are drawn at the top of the color span so that they show
        # at full brightness.
        cspan = (min_bright, max_bright)
        circle = cspan[1] * imops.generate_donut_mask(4, 3)
        square = cspan[1] * imops.generate_square_mask(square_radius)

        z = ZPlots()
        # Upper color limit for the signal strip below: median plus the
        # 99.9th percentile of all signal values in this channel.
        sig_for_channel = all_sig[:, channel_i, :]
        sig_top = np.median(sig_for_channel) + np.percentile(
            sig_for_channel, 99.9)

        if peak_i is not None:
            # Per-cycle signal of this peak, reshaped into a single-row
            # image for z.im.
            rad = sig_for_channel[peak_i]
            rad = rad.reshape(1, rad.shape[0])
            print("\n".join([
                f"    cycle {cycle:2d}: {r:6.0f}"
                for cycle, r in enumerate(rad[0])
            ]))
            z.scat(x=range(len(rad[0])), y=rad[0])
            z.im(rad, _cspan=(0, sig_top), f_plot_height=50, _notools=True)

            # This is inefficient because the function we will call
            # does the same image load, but I'd prefer to not repeat
            # the code here and want to be able to call this fn
            # from notebooks:
            _raw_peak_i_zoom(
                field_i,
                res,
                df,
                peak_i,
                channel_i,
                zoom=3.0,
                square_radius=square_radius,
                x_pad=1,
                cspan=cspan,
                separate=False,
                show_circles=show_circles,
            )

        # The frame comes from a different accessor depending on which
        # result block is being viewed. Both branches copy it, and the
        # markers are accumulated into that copy.
        if result_block == "sigproc_v1":
            im = res.raw_chcy_ims(field_i).copy()[channel_i, cycle_i]
        else:
            im = res.aln_ims[field_i, channel_i, cycle_i].copy()

        if peak_i is not None:
            # Peak mode: mark only the selected peak, at its position in
            # this cycle.
            cy_rec = peak_records[peak_records.cycle_i == cycle_i].iloc[0]
            im_marker = square if peak_i_square else circle
            imops.accum_inplace(
                im,
                im_marker,
                loc=XY(cy_rec.raw_x, cy_rec.raw_y),
                center=True,
            )

        elif show_circles:
            # Field mode: circle every peak of this field in this cycle.
            peak_records = df[(df.field_i == field_i)
                              & (df.cycle_i == cycle_i)]

            # In the case of a field with no peaks, n_peaks may be NaN, so check that we have
            # some peaks before passing NaNs to imops.
            # NOTE(review): iloc[0] raises IndexError if no row matches this
            # field/cycle at all -- confirm df always has at least one row
            # per field and cycle.
            if peak_records.n_peaks.iloc[0] > 0:
                for i, peak in peak_records.iterrows():
                    imops.accum_inplace(
                        im,
                        circle,
                        loc=XY(peak.raw_x, peak.raw_y),
                        center=True,
                    )

        z.im(
            im,
            f_title=f"ch_i={channel_i}  cy_i={cycle_i}  fl_i={field_i}",
            _full=True,
            _noaxes=True,
            _cspan=(float(min_bright), float(max_bright)),
        )
        displays.fix_auto_scroll()