Пример #1
0
def analyze_kimura_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Summarize and plot the Kimura distances stored in ``df["Kimura-to-query"]``.

    Each "Kimura-to-query" cell is a string holding a Python list literal.
    Rows whose value is "[]" are dropped, the remaining cells are parsed, and
    per-row mean/std/min/max columns are added to a private copy of ``df``
    before the downstream plotting helpers are called.

    :param env: environment mapping; ``env["pd-work"]`` is handed to
        ``next_name`` to obtain the output name of each figure.
    :param df: frame with at least the columns "Kimura-to-query",
        "Genome GC", "Ancestor" and "GCFID". The caller's frame is not
        modified (all new columns are added to a copy).
    """
    pd_work = env["pd-work"]

    def plot_average_vs_gc(frame, scatter):
        # type: (pd.DataFrame, bool) -> None
        # One lowess fit per ancestor of average Kimura against genome GC.
        sns.lmplot(frame,
                   "Genome GC",
                   "Average-Kimura",
                   hue="Ancestor",
                   sns_kwargs={
                       "scatter": scatter,
                       "lowess": True,
                       "scatter_kws": {
                           "s": 5
                       },
                       "palette": CM.get_map("ancestor")
                   },
                   figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # An empty serialized list has no distances to summarize.
    df = df[df["Kimura-to-query"] != "[]"].copy()
    df["Kimura-to-query"] = df["Kimura-to-query"].apply(ast.literal_eval)
    df["Average-Kimura"] = df["Kimura-to-query"].apply(np.mean)
    df["Std-Kimura"] = df["Kimura-to-query"].apply(np.std)

    # Per-row view: trend lines only.
    plot_average_vs_gc(df, scatter=False)

    # Per-genome view. "Kimura-to-query" now holds Python lists, which cannot
    # be averaged; numeric_only=True keeps the aggregation restricted to
    # numeric columns instead of raising on recent pandas versions.
    df_mean = df.groupby(["Ancestor", "GCFID"],
                         as_index=False).mean(numeric_only=True)
    plot_average_vs_gc(df_mean, scatter=True)

    # Min/max kimura
    df["Min-Kimura"] = df["Kimura-to-query"].apply(min)
    df["Max-Kimura"] = df["Kimura-to-query"].apply(max)

    contour_kimura_per_ancestor(env, df)
    one_dim_Kimura_accuracy(env, df)

    kimura_dist_plot(env, df)
    heat_map_Kimura_accuracy(env,
                             df,
                             "Min-Kimura",
                             "Max-Kimura",
                             balance=True,
                             xlabel="Minimum Kimura",
                             ylabel="Maximum Kimura")
    heat_map_Kimura_accuracy(env,
                             df,
                             "Average-Kimura",
                             "Std-Kimura",
                             balance=False)
Пример #2
0
def analyze_gms2_components_on_verified_set(env, gil):
    # type: (Environment, GenomeInfoList) -> None
    """Run the per-genome component analysis and plot the error per component.

    The per-genome frames returned by
    ``analyze_gms2_components_on_verified_set_for_gi`` are concatenated, a
    "Genome" column is derived row-wise with ``fix_names``, the combined table
    is printed as CSV, and a grouped bar plot of "Error" per "Genome" (one bar
    per "Component") is drawn.

    :param env: environment mapping; ``env["pd-work"]`` is handed to
        ``next_name`` to name the saved figure.
    :param gil: iterable of genome descriptors, one analysis run per item.
    """
    # run different components
    list_df = [
        analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil
    ]

    df = pd.concat(list_df, ignore_index=True, sort=False)
    df["Genome"] = df.apply(fix_names, axis=1)
    print(df.to_csv())

    # Spelled out as a concrete list (previously a one-shot ``reversed``
    # iterator, which is empty for anything that walks it a second time).
    hue_order = [
        "MGM", "Promoter", "Start Codons", "RBS", "Start Context", "MGM2*",
        "GMS2"
    ]

    _fig, ax = plt.subplots(figsize=(12, 4))
    sns.barplot(df, "Genome", "Error", hue="Component",
                ax=ax,
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"])
                ),
                sns_kwargs={
                    "hue_order": hue_order,
                    "palette": CM.get_map("gms2_components")
                })
def kimura_dist_plot(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Overlay, on a single axis, one "Average-Kimura" density curve per ancestor.

    :param env: environment mapping; ``env["pd-work"]`` is handed to
        ``next_name`` to name the saved figure.
    :param df: frame with the columns "Ancestor" and "Average-Kimura".
    """
    import seaborn
    import matplotlib.pyplot as plt

    # Sorted so that the drawing and legend order are reproducible between
    # runs (plain set iteration order is not), matching how
    # one_dim_Kimura_accuracy orders its ancestors.
    ancestors = sorted(set(df["Ancestor"]))
    palette = CM.get_map("ancestor")

    fig, ax = plt.subplots()  # type: plt.Figure, plt.Axes
    for anc in ancestors:
        df_group = df[df["Ancestor"] == anc]
        seaborn.distplot(df_group["Average-Kimura"],
                         ax=ax,
                         color=palette[anc],
                         hist=False,
                         label=anc)

    # Curves were added in ``ancestors`` order, so labels line up one-to-one.
    ax.legend(ancestors)
    ax.set_ylabel("PDF")
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot the number of start candidates against GC content.

    Reads the CSV at ``args.pf_input``, lets ``add_percentages`` update it,
    keeps rows with fewer than 100 "Total Candidates", and writes three
    figures to ``env["pd-work"]`` as ``0.pdf``, ``1.pdf`` and ``2.pdf``:
    a joint plot over all rows, then a scatter plot and a lowess fit of the
    per-(gcfid, ancestor) means. Every figure is also shown interactively.

    :param env: environment mapping; ``env["pd-work"]`` is the output directory.
    :param args: parsed command-line arguments; only ``pf_input`` is read.
    """

    def save_and_show(fig_num):
        # type: (int) -> int
        # Save the current figure as "<fig_num>.pdf", show it, and return the
        # number to use for the next figure.
        plt.savefig(os.path.join(env["pd-work"], "{}.pdf".format(fig_num)),
                    bbox_inches='tight')
        plt.show()
        return fig_num + 1

    df = pd.read_csv(args.pf_input, header=0)
    add_percentages(df)
    sns.set_context(context="paper", font_scale=1.5)

    # clean up
    df = df[df["Total Candidates"] < 100]

    palette = sns.xkcd_palette(
        ["windows blue", "amber", "faded green", "dusty purple"])
    sns.palplot(palette)
    plt.show()
    sns.set_palette(palette)

    fig_num = 0

    # jointplot is figure-level: it builds its own figure, so no plt.figure()
    # call precedes it (one would only leave an empty extra figure behind).
    sns.jointplot(x="gc", y="Total Candidates", data=df)
    fig_num = save_and_show(fig_num)

    # Average number of candidates per GC
    df_tmp = df.groupby(["gcfid", "ancestor"], as_index=False).agg("mean")

    # scatterplot is axes-level and draws into the current figure, so the
    # figure size is set here.
    plt.figure(figsize=(12, 4))
    g = sns.scatterplot(x="gc",
                        y="Total Candidates",
                        data=df_tmp,
                        hue="ancestor",
                        palette=CM.get_map("ancestor"))
    plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
    g.set(ylabel="Average number of candidates")
    g.set(xlabel="GC")
    fig_num = save_and_show(fig_num)

    # Same data as a per-ancestor lowess fit. lmplot is figure-level as well;
    # its size is controlled by ``aspect``.
    g = sns.lmplot(x="gc",
                   y="Total Candidates",
                   data=df_tmp,
                   hue="ancestor",
                   aspect=2,
                   legend=False,
                   ci=None,
                   lowess=True,
                   palette=CM.get_map("ancestor"))
    plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
    g.set(ylabel="Average number of candidates")
    g.set(xlabel="GC")
    save_and_show(fig_num)
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot statistics of the most frequent distance to the upstream gene.

    ``df["Upstream-distance"]`` holds string-encoded Python lists. Rows with
    an empty list ("[]") are dropped; for the rest the most frequent distance
    (``most_frequent``) and the consistency columns ``PC(x,0)`` / ``PC(x,3)``
    (``compute_consistency`` with flexibility 0 and 3) are computed. Only rows
    with ``Support > 10`` and a most frequent distance strictly between -50
    and 100 are plotted; every plot after the first additionally requires
    ``GMS2=SBSP`` to be true.

    Figures drawn with a ``figure_options`` / ``save_figure`` call are named
    by ``next_name`` inside ``<env["pd-work"]>/upstream_distances``; the
    remaining figures are only shown.

    :param env: environment mapping; ``env["pd-work"]`` is the parent of the
        output directory created here.
    :param df: frame with at least the columns "Upstream-distance",
        "Support", "GMS2=SBSP", "NCBI", "(GMS2=SBSP)!=NCBI" and "Ancestor".
        The caller's frame is not modified.
    """
    # Raw seaborn/pyplot are imported locally (once); ``sns`` is called below
    # with a wrapper-style signature (frame, x, y, sns_kwargs, figure_options).
    import seaborn
    import matplotlib.pyplot as plt

    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    upstream = "Most frequent upstream"
    error_col = "(GMS2=SBSP)!=NCBI % GMS2=SBSP"

    # remove empty lists
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df[upstream] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in (0, 3):
        df["PC(x,{})".format(flexibility)] = df[[
            upstream, "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"],
                                               r[upstream], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # The support mask is already satisfied by every remaining row; it is kept
    # so that the combined masks are built exactly as before.
    supported = df["Support"] > 10
    below_max = df[upstream] < 100
    above_min = df[upstream] > -50

    df_tmp = df[supported & below_max & above_min]
    # NCBI consistency as a func
    df = df[supported & (df["GMS2=SBSP"]) & below_max & above_min]

    # One row per (gene, flexibility) so both PC curves share a single plot.
    df_tmp = stack_columns_as_rows(
        df_tmp[[upstream, "PC(x,0)", "PC(x,3)", "Ancestor"]],
        ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")

    sns.lmplot(df_tmp,
               upstream,
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 upstream,
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    def plot_counts_by_clade(counts, value_name, ylabel):
        # type: (pd.Series, str, str) -> None
        # Point plot of a per-clade value_counts() result, saved and shown.
        seaborn.catplot(data=counts.rename(value_name).reset_index(),
                        x=upstream,
                        y=value_name,
                        hue="Ancestor",
                        kind='point',
                        scale=0.5,
                        legend=False,
                        palette=CM.get_map("ancestor"),
                        aspect=1.5)

        plt.legend(loc="best", title="Clade")
        figure_options = FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="Most frequent distance to upstream gene",
            ylabel=ylabel)
        plt.xlabel(figure_options.xlabel)
        plt.ylabel(figure_options.ylabel)
        save_figure(figure_options)

        plt.show()

    # Distribution of short distances (strictly between -10 and 10) per clade,
    # first as a percentage within each clade, then as raw counts.
    df_near = df[(df[upstream] < 10) & (df[upstream] > -10)]
    plot_counts_by_clade(
        df_near.groupby("Ancestor")[upstream].value_counts(
            normalize=True).mul(100),
        'Percentage (by clade)',
        "Percent of components (by clade)")
    plot_counts_by_clade(
        df_near.groupby("Ancestor")[upstream].value_counts(),
        'number',
        "Number of components")

    # Histogram (left axis) and KDE (right axis, ticks hidden) per ancestor.
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group[upstream], kde=False, ax=ax1)

        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group[upstream], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    plt.show()

    print(df[upstream].value_counts(normalize=True))

    # Consistency versus distance, per ancestor, for both flexibilities.
    for pc_column in ("PC(x,0)", "PC(x,3)"):
        sns.lmplot(df,
                   upstream,
                   pc_column,
                   hue="Ancestor",
                   sns_kwargs={
                       "scatter": False,
                       "lowess": True,
                       "palette": CM.get_map("ancestor")
                   },
                   figure_options=FigureOptions(save_fig=next_name(pd_work),
                                                xlim=[-7, None],
                                                ylim=[0, 1]))

    def plot_range_summaries(frame):
        # type: (pd.DataFrame) -> None
        # Error rate and GMS2=SBSP value per distance range, one line per clade.
        for y_column in (error_col, "GMS2=SBSP"):
            sns.catplot(frame,
                        "Range",
                        y_column,
                        hue="Ancestor",
                        kind="point",
                        sns_kwargs={"palette": CM.get_map("ancestor")})

    # NCBI sensitivity, summarized per genome and then averaged per ancestor.
    # Each range is half-open: lower bound included, upper bound excluded.
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:
        r_filter = (df[upstream] >= r[0]) & (df[upstream] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)
    plot_range_summaries(df_tmp)

    # do not average per gcfid - average per ancestor
    list_collect = list()
    for r in ranges:
        r_filter = (df[upstream] >= r[0]) & (df[upstream] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            num_with_ncbi_pred = f_gms2_eq_sbsp_with_ncbi_pred.sum()
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                error_col: 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() /
                float(num_with_ncbi_pred),
                "GMS2=SBSP": num_with_ncbi_pred
            })

    # Tick positions (range midpoints) and their labels (the range tuples).
    range_label = list(ranges)
    range_avgs = [(r[1] + r[0]) / 2.0 for r in ranges]

    df_tmp = pd.DataFrame(list_collect)
    plot_range_summaries(df_tmp)

    ancestors = list(set(df_tmp["Ancestor"]))
    # squeeze=False: always get a 2-D array of axes, so ``ravel`` also works
    # when there is a single ancestor (plt.subplots would otherwise return a
    # bare Axes object).
    fig, axes = plt.subplots(len(ancestors), 1, sharex="all", squeeze=False)
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        # x/y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        seaborn.lineplot(x="range_avg", y=error_col, data=curr_df, ax=ax)
        seaborn.lineplot(x="range_avg",
                         y="GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    # All ancestors on one pair of axes: error rate (left) and count (right).
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot(x="range_avg",
                     y=error_col,
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot(x="range_avg",
                     y="GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()
def _add_query_percentages(df, order_by):
    # type: (pd.DataFrame, str) -> None
    """Add per-ancestor query percentages to ``df`` in place.

    Resets the index of ``df`` (the previous index becomes a column) and adds
    "Percentage-of-queries" (share of the ancestor's total
    "Number-of-queries", in percent) and "Cumulative-percentage-of-queries"
    (running sum of that share with the ancestor's rows ordered by
    ``order_by``).
    """
    # Float from the start: the values assigned below are floats.
    df["Percentage-of-queries"] = 0.0
    df["Cumulative-percentage-of-queries"] = 0.0
    df.reset_index(inplace=True)

    for _ancestor, df_group in df.groupby("Ancestor", as_index=False):
        ordered = df_group.sort_values(order_by).index
        total = df_group["Number-of-queries"].sum()

        df.loc[ordered, "Percentage-of-queries"] = 100 * df.loc[
            ordered, "Number-of-queries"] / float(total)
        # cumsum runs in ``ordered`` order; the assignment aligns on labels.
        df.loc[ordered, "Cumulative-percentage-of-queries"] = df.loc[
            ordered, "Percentage-of-queries"].cumsum()


def one_dim_Kimura_accuracy(env, df_all, num_steps=20):
    # type: (Environment, pd.DataFrame, int) -> None
    """Plot accuracy and query counts against binned Kimura statistics.

    ``bin_data_one_d`` is applied first to "Average-Kimura" over all rows and
    then to "Std-Kimura" over the rows with ``Support > 2``. For each of the
    two binned tables four line plots (one line per ancestor) are produced:
    accuracy, number of queries, percentage of queries and cumulative
    percentage of queries. Every figure is named through ``next_name``.

    :param env: environment mapping; ``env["pd-work"]`` is handed to
        ``next_name`` and ``env`` itself to ``bin_data_one_d``.
    :param df_all: input frame; "Support" is read here, the remaining columns
        are consumed by ``bin_data_one_d``.
    :param num_steps: number of bins, forwarded to ``bin_data_one_d``.
    """
    import matplotlib.pyplot as plt
    pd_work = env["pd-work"]

    def plot_lines(frame, x_column, y_column):
        # type: (pd.DataFrame, str, str) -> None
        # Line plot with default figure options, one line per ancestor.
        sns.lineplot(frame,
                     x_column,
                     y_column,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         save_fig=next_name(pd_work), ),
                     sns_kwargs={"palette": CM.get_map("ancestor")})

    # The binned table is expected to carry the binned column plus
    # "Accuracy", "Number-of-queries" and "Ancestor" (all are used below).
    df = bin_data_one_d(env, df_all, "Average-Kimura", num_steps)
    plot_lines(df, "Average-Kimura", "Accuracy")
    plot_lines(df, "Average-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Average-Kimura")

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(df,
                 "Average-Kimura",
                 "Percentage-of-queries",
                 hue="Ancestor",
                 figure_options=FigureOptions(save_fig=next_name(pd_work),
                                              ylabel="Percentage of queries",
                                              xlabel="Average Kimura"),
                 ax=ax,
                 show=True,
                 legend_loc="best",
                 sns_kwargs={"palette": CM.get_map("ancestor")})
    plot_lines(df, "Average-Kimura", "Cumulative-percentage-of-queries")

    # standard dev
    df = bin_data_one_d(env, df_all[df_all["Support"] > 2], "Std-Kimura",
                        num_steps)
    plot_lines(df, "Std-Kimura", "Accuracy")
    plot_lines(df, "Std-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Std-Kimura")

    plot_lines(df, "Std-Kimura", "Percentage-of-queries")
    plot_lines(df, "Std-Kimura", "Cumulative-percentage-of-queries")
def viz_summary_per_gcfid(env, df, title=None):
    # type: (Environment, pd.DataFrame, Optional[str]) -> None
    """Draw the standard set of per-genome summary figures.

    Produces, in this order: two box plots per ancestor, a scatter plot and
    two lowess fits of the NCBI disagreement rate against genome GC, three
    lowess fits of the GMS2=SBSP columns against genome GC, two scatter plots
    of the NCBI disagreement rate against "NCBI" and "GMS2=SBSP", and a
    scatter plot of the Prodigal disagreement rate against genome GC. Each
    figure gets its own name from ``next_name(env['pd-work'])``.

    :param env: environment mapping; only ``env['pd-work']`` is read.
    :param df: summary frame holding the columns plotted below.
    :param title: optional title applied to every figure.
    """
    out_dir = env['pd-work']

    ncbi_error = "(GMS2=SBSP)!=NCBI % GMS2=SBSP"
    ncbi_error_label = "1 - Sen(NCBI, GMS2=SBSP)"

    def options(**extra):
        # Fresh figure options (and therefore a fresh output name) per figure.
        return FigureOptions(save_fig=next_name(out_dir), title=title, **extra)

    def palette_kwargs():
        return {"palette": CM.get_map("ancestor")}

    def lowess_kwargs(scatter, **extra):
        # Seaborn arguments shared by all lowess fits; points are drawn small
        # whenever they are drawn at all.
        kwargs = palette_kwargs()
        kwargs["scatter"] = scatter
        kwargs["lowess"] = True
        if scatter:
            kwargs["scatter_kws"] = {"s": 5}
        kwargs.update(extra)
        return kwargs

    # Box plots per ancestor.
    sns.catplot(df,
                "Ancestor",
                "GMS2=SBSP % SBSP",
                kind="box",
                figure_options=options(ylim=[None, 100]),
                sns_kwargs=palette_kwargs())

    sns.catplot(df,
                "Ancestor",
                ncbi_error,
                kind="box",
                figure_options=options(ylim=[0, 20],
                                       ylabel=ncbi_error_label,
                                       xlabel="Clade"),
                sns_kwargs=palette_kwargs())

    # NCBI disagreement rate against genome GC: raw points, trend only, and
    # trend with points.
    sns.scatterplot(df,
                    "Genome GC",
                    ncbi_error,
                    hue="Ancestor",
                    figure_options=options(ylim=[0, None]),
                    legend_loc="best",
                    sns_kwargs=palette_kwargs())

    sns.lmplot(df,
               "Genome GC",
               ncbi_error,
               hue="Ancestor",
               figure_options=options(ylim=[0, None],
                                      ylabel=ncbi_error_label),
               sns_kwargs=lowess_kwargs(False))

    sns.lmplot(df,
               "Genome GC",
               ncbi_error,
               hue="Ancestor",
               figure_options=options(ylim=[0, None],
                                      ylabel=ncbi_error_label),
               legend_loc="best",
               sns_kwargs=lowess_kwargs(True, aspect=1.5))

    # GMS2=SBSP columns against genome GC.
    for y_column, y_limits in (("GMS2=SBSP", [0, None]),
                               ("GMS2=SBSP % SBSP", [50, 100]),
                               ("GMS2=SBSP % GMS2", [50, 100])):
        sns.lmplot(df,
                   "Genome GC",
                   y_column,
                   hue="Ancestor",
                   figure_options=options(ylim=y_limits),
                   sns_kwargs=lowess_kwargs(True))

    # NCBI disagreement rate against the "NCBI" and "GMS2=SBSP" columns.
    for x_column in ("NCBI", "GMS2=SBSP"):
        sns.scatterplot(df,
                        x_column,
                        ncbi_error,
                        hue="Ancestor",
                        figure_options=options(ylim=[0, None]),
                        sns_kwargs=palette_kwargs())

    # Prodigal disagreement rate against genome GC.
    sns.scatterplot(df,
                    "Genome GC",
                    "(GMS2=SBSP)!=Prodigal % GMS2=SBSP",
                    hue="Ancestor",
                    figure_options=options(ylim=[0, None]),
                    sns_kwargs=palette_kwargs())
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative SBSP steps, then print tables.

    One summary is built with ``get_summary_per_gcfid`` for each cumulative
    step ("A", "A+B", "A+B+C").  The summaries are drawn on a 3x2 grid of
    line plots (left column titled "SBSP", right column titled "GMS2=SBSP"),
    the figure is saved and shown, and two tables computed from the whole
    frame are printed.

    Side effects:
        * ``df`` is modified in place: the columns "Total SBSP",
          "Total GMS2" and "Total GMS2=SBSP" are written twice (first as
          per-genome sums, later as per-genome sums of ``tool & NCBI``).
        * The function ends with ``sys.exit()``; it never returns.

    :param env: environment; ``env['pd-work']`` is handed to ``next_name``
    :param df: frame with the columns "GCFID", "SBSP", "GMS2", "GMS2=SBSP",
        "NCBI" and "Predicted-at-step"
    """
    pd_work = env['pd-work']

    # (total column, source column) pairs, processed in this order
    total_columns = [
        ("Total SBSP", "SBSP"),
        ("Total GMS2", "GMS2"),
        ("Total GMS2=SBSP", "GMS2=SBSP"),
    ]

    # per-genome totals of each prediction column
    for _, genome_rows in df.groupby("GCFID", as_index=False):
        row_labels = genome_rows.index
        for total_name, source_name in total_columns:
            df.loc[row_labels, total_name] = df.loc[row_labels,
                                                    source_name].sum()

    # one summary per cumulative step: "A", "A+B", "A+B+C"
    summaries = []
    steps_so_far = []
    for step in ["A", "B", "C"]:
        steps_so_far.append(step)
        summary = get_summary_per_gcfid(df[df["Predicted-at-step"] <= step])
        summary["SBSP Step"] = "+".join(steps_so_far)
        summaries.append(summary)

    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # (ylabel, lower ylim, upper ylim) for grid rows 0, 1 and 2
    row_styles = [
        ("Sensitivity", 85, 105),
        ("Percent of Genes", 0, None),
        ("Number of Genes", 0, None),
    ]

    def draw_column(column_axes, y_columns, legend_on_last):
        # Draw the three stacked panels of one grid column, top to bottom.
        # Only the bottom panel may keep the wrapper's default legend.
        bottom = len(y_columns) - 1
        for row, y_column in enumerate(y_columns):
            ylabel, y_low, y_high = row_styles[row]
            call_kwargs = {
                "hue": "GCFID",
                "ax": column_axes[row],
                "sns_kwargs": {"palette": CM.get_map("verified")},
            }
            if not (legend_on_last and row == bottom):
                call_kwargs["legend"] = False
            call_kwargs["figure_options"] = FigureOptions(
                ylabel=ylabel, ylim=[y_low, y_high])
            sns.lineplot(df_per_gcfid_per_step, "SBSP Step", y_column,
                         **call_kwargs)

    left_axes = axes[:, 0]
    draw_column(left_axes,
                ["Sen(SBSP,NCBI)", "Cov(SBSP,NCBI)", "SBSP"],
                legend_on_last=False)
    fig.align_ylabels(left_axes)

    right_axes = axes[:, 1]
    draw_column(right_axes,
                ["Sen(GMS2=SBSP,NCBI)", "Cov(GMS2=SBSP,NCBI)", "GMS2=SBSP"],
                legend_on_last=True)

    # The bottom-right panel was drawn with a legend only so that its
    # handles can feed the figure-level legend below.
    legend_axes = right_axes[2]
    legend_axes.get_legend().remove()

    fig.align_ylabels(right_axes)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title("SBSP")
    axes[0][1].set_title("GMS2=SBSP")

    fig.subplots_adjust(bottom=0.21)

    handles, labels = legend_axes.get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(pd_work))
    plt.show()

    # Overwrite the totals: per genome, sum of the element-wise `&` of each
    # prediction column with the "NCBI" column.
    for _, genome_rows in df.groupby("GCFID", as_index=False):
        ncbi = genome_rows["NCBI"]
        for total_name, source_name in total_columns:
            df.loc[genome_rows.index,
                   total_name] = (genome_rows[source_name] & ncbi).sum()

    df_all = get_summary_per_gcfid(df)

    printed_tables = [
        [
            "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
            "Sen(GMS2=SBSP,NCBI)"
        ],
        [
            "GCFID", "NCBI", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
            "Cov2(GMS2=SBSP,NCBI)"
        ],
    ]
    for table_columns in printed_tables:
        print(df_all[table_columns].to_string(index=False))

    # Raises SystemExit: nothing after this call runs.
    import sys
    sys.exit()
Пример #9
0
def viz_per_genome(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot BLASTp hit statistics per genome, grouped by ancestor.

    Produces, in order:
      1. a categorical plot of the per-(Genome, Ancestor) mean of "BLAST";
      2. a line plot of the percentage of rows per genome whose "BLAST"
         value is below a growing threshold (x = threshold, y = percentage);
      3. the same data with the axes swapped;
      4. a 2x2 grid with one panel per ancestor (sorted), restricted to
         thresholds <= 40, saved with ``fig.savefig`` and shown.

    ``matplotlib.rcParams`` is updated globally twice (before plot 2 and
    before plot 4).

    :param env: environment; ``env["pd-work"]`` is handed to ``next_name``
    :param df: frame with the columns "Genome", "Ancestor" and "BLAST"
    """

    def apply_font_sizes(small, medium, bigger):
        # Push the shared LaTeX/serif settings plus the given font sizes
        # into matplotlib's global rcParams.
        matplotlib.rcParams.update({
            'font.family': 'serif',
            'text.usetex': True,
            'pgf.rcfonts': False,
            'font.size': small,  # default text size
            'axes.titlesize': small,  # axes title
            'axes.labelsize': medium,  # x and y labels
            'xtick.labelsize': small,  # x tick labels
            'ytick.labelsize': small,  # y tick labels
            'legend.fontsize': 12,  # legend
            'figure.titlesize': bigger,  # figure title
        })

    genome_means = df.groupby(["Genome", "Ancestor"], as_index=False).mean()

    sns.catplot(genome_means,
                "Ancestor",
                "BLAST",
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"]),
                    xlabel="Clade",
                    ylabel="Number of BLASTp Hits"),
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # 40 thresholds: 0, 5, then each one 1.2 times the previous.
    thresholds = [0]
    while len(thresholds) < 40:
        previous = thresholds[-1]
        thresholds.append(5 if previous == 0 else previous * 1.2)

    # For every genome and threshold: percentage of that genome's rows whose
    # "BLAST" value is strictly below the threshold.
    list_entries = []
    for _, genome_rows in df.groupby("Genome", as_index=False):
        first_label = genome_rows.index[0]
        genome = df.at[first_label, "Genome"]
        ancestor = df.at[first_label, "Ancestor"]
        total_queries = len(genome_rows)

        list_entries.extend({
            "Genome": genome,
            "Ancestor": ancestor,
            "x": cutoff,
            "y": 100 * len(genome_rows[genome_rows["BLAST"] < cutoff]) /
            total_queries
        } for cutoff in thresholds)

    df_tmp = pd.DataFrame(list_entries)

    apply_font_sizes(small=16, medium=22, bigger=24)

    hits_label = "Number of BLASTp hits"
    percent_label = "Cumulative percentage of queries (per genome)"

    # Same curves twice: thresholds on the x axis first, then on the y axis.
    orientations = [
        ("x", "y", hits_label, percent_label),
        ("y", "x", percent_label, hits_label),
    ]
    for x_column, y_column, x_text, y_text in orientations:
        sns.lineplot(df_tmp,
                     x_column,
                     y_column,
                     hue="Ancestor",
                     figure_options=FigureOptions(
                         xlabel=x_text,
                         ylabel=y_text,
                         save_fig=next_name(env["pd-work"]),
                     ),
                     legend_loc="best",
                     legend_title="",
                     legend_ncol=2,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })

    apply_font_sizes(small=14, medium=18, bigger=20)

    fig, axes = plt.subplots(2, 2, sharex="all", sharey="all")

    # One panel per ancestor; zip stops at whichever runs out first
    # (ancestors or the four panels).
    for anc, panel in zip(sorted(set(df["Ancestor"])), axes.ravel()):
        df_anc = df_tmp[df_tmp["Ancestor"] == anc]
        sns.lineplot(df_anc[df_anc["x"] <= 40],
                     "x",
                     "y",
                     hue="Ancestor",
                     legend=None,
                     ax=panel,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })
        panel.set_title(anc)
        panel.set_xlabel("")
        panel.set_ylabel("")

    # Only the labels of this object are used below.
    # NOTE(review): next_name is called here and again for fig.savefig;
    # whether both calls yield the same path depends on next_name - confirm.
    figure_options = FigureOptions(
        xlabel="Number of BLASTp hits",
        ylabel="Cumulative percentage of\nqueries (per genome)",
        save_fig=next_name(env["pd-work"]),
    )

    # Frameless axes spanning the whole figure, used only to carry the
    # shared x/y labels; all of its ticks and tick labels are switched off.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(figure_options.xlabel, labelpad=30)
    plt.ylabel(figure_options.ylabel, labelpad=30)

    fig.savefig(next_name(env["pd-work"]), bbox_inches="tight")
    plt.show()
Пример #10
0
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative SBSP steps and export tables.

    One summary is built with ``get_summary_per_gcfid`` for each cumulative
    step ("A", "A+B", "A+B+C").  The summaries are drawn on a 3x2 grid of
    line plots (left column titled ``TOOL``, right column titled ``TOOLp``),
    the figure is saved and shown, and two CSV files ("sensitivity.csv" and
    "coverage.csv") are written under ``env["pd-work"]``.

    Side effect: ``df`` is modified in place.  The columns "Total SBSP",
    "Total GMS2" and "Total GMS2=SBSP" are written twice (first as
    per-genome sums, later as per-genome sums of ``tool & NCBI``).

    :param env: environment; must provide "pd-work"
    :param df: frame with the columns "GCFID", "SBSP", "GMS2", "GMS2=SBSP",
        "NCBI" and "Predicted-at-step"
    """
    import matplotlib.pyplot as plt

    tools = ("SBSP", "GMS2", "GMS2=SBSP")

    # compute total number of predictions per tool, per genome
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index,
                   "Total " + tool] = df.loc[df_group.index, tool].sum()

    # gather analysis for steps A, A+B, and A+B+C
    list_df = list()  # type: List[pd.DataFrame]
    steps_so_far = []
    for step in ["A", "B", "C"]:
        steps_so_far.append(step)
        df_summary_per_gcfid = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        df_summary_per_gcfid["SBSP Step"] = "+".join(steps_so_far)
        list_df.append(df_summary_per_gcfid)

    df_per_gcfid_per_step = pd.concat(list_df, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    def draw_column(column_axes, panels, legend_on_last=False):
        # Draw the stacked panels of one grid column, top to bottom.  Each
        # panel is (y column, y label, y limits).  Only the last panel may
        # keep the wrapper's default legend.
        last = len(panels) - 1
        for row, (y_column, ylabel, ylim) in enumerate(panels):
            extra = {}
            if not (legend_on_last and row == last):
                extra["legend"] = False
            sns.lineplot(df_per_gcfid_per_step,
                         "SBSP Step",
                         y_column,
                         hue="GCFID",
                         ax=column_axes[row],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=list(ylim)),
                         **extra)

    # NOTE(review): the "Sen(...)" columns are labelled as error here;
    # confirm against get_summary_per_gcfid.
    left_axes = axes[:, 0]
    draw_column(
        left_axes,
        [
            # Raw string: "\%" is an invalid escape sequence in a normal
            # literal; the backslash is required by the LaTeX text renderer.
            ("Sen(SBSP,NCBI)", r"Error rate (\%)", (0, 20)),
            ("Cov(SBSP,NCBI)", "Percentage\nof Genes", (0, None)),
            ("SBSP", "Number\nof Genes", (0, None)),
        ])
    fig.align_ylabels(left_axes)

    right_axes = axes[:, 1]
    draw_column(
        right_axes,
        [
            ("Sen(GMS2=SBSP,NCBI)", "Error", (0, None)),
            ("Cov(GMS2=SBSP,NCBI)", "Percentage of Genes", (0, None)),
            ("GMS2=SBSP", "Number of Genes", (0, None)),
        ],
        legend_on_last=True)

    # The bottom-right panel was drawn with a legend only so that its
    # handles can feed the figure-level legend below.
    legend_axes = right_axes[2]
    legend_axes.get_legend().remove()

    fig.align_ylabels(right_axes)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title(TOOL)
    axes[0][1].set_title(TOOLp)

    fig.subplots_adjust(bottom=0.21)

    handles, labels = legend_axes.get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()

    # Overwrite the totals: per genome, sum of the element-wise `&` of each
    # prediction column with the "NCBI" column.
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index,
                   "Total " + tool] = (df_group[tool] &
                                       df_group["NCBI"]).sum()

    df_all = get_summary_per_gcfid(df)

    # map column names for tables
    columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    exported_columns = ["Genome", "Verified", "SBSP", "GMS2", "GMS2=SBSP"]

    # (output file, rename map); sensitivity is written first, then coverage
    tables = [
        ("sensitivity.csv", {
            "GCFID": "Genome",
            "NCBI": "Verified",
            "Sen(SBSP,NCBI)": "SBSP",
            "Sen(GMS2,NCBI)": "GMS2",
            "Sen(GMS2=SBSP,NCBI)": "GMS2=SBSP",
        }),
        ("coverage.csv", {
            "GCFID": "Genome",
            "NCBI": "Verified",
            "Cov2(SBSP,NCBI)": "SBSP",
            "Cov2(GMS2,NCBI)": "GMS2",
            "Cov2(GMS2=SBSP,NCBI)": "GMS2=SBSP",
        }),
    ]
    for file_name, rename_map in tables:
        df_table = df_all[columns].rename(columns=rename_map)
        df_table[exported_columns].to_csv(os_join(env["pd-work"], file_name),
                                          index=False)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot accuracy against chunk size per genome for each tool on one axes.

    Reads the CSV at ``args.pf_data``, draws the rows with Tool == "SBSP",
    Tool == "GMS2" and (when present) Tool == "MGM" onto a single axes,
    restyles lines by their position in ``ax.lines``, applies the final
    figure options, then saves and shows the figure.

    :param env: environment; ``env["pd-work"]`` is handed to ``next_name``
    :param args: namespace providing ``pf_data`` (CSV path) and ``with_mgm``
    """
    df = pd.read_csv(args.pf_data)
    # Rescale the chunk size by 1/1000.
    # NOTE(review): the axis label below says "mb"; the input unit is
    # presumably kb - confirm.
    df["chunk-size"] /= 1000

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()



    sns.lineplot(df[df["Tool"] == "SBSP"], "chunk-size", "percentage-common-3prime-and-5prime-from-common-3prime",
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified"), "linestyle": "dashed"},
                 ax=ax,
                 legend=False,
                 figure_options=FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74, 101],
                     save_fig=next_name(env["pd-work"])
                 ))

    # Every line currently on the axes (i.e. whatever the SBSP call added)
    # is forced to a dashed style.
    for l in ax.lines:
        l.set_linestyle("--")

    sns.lineplot(df[df["Tool"] == "GMS2"], "chunk-size", "percentage-common-3prime-and-5prime-from-common-3prime",
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified")},
                 legend_loc="best",
                 legend_ncol=2,
                 ax=ax)




    if args.with_mgm:
        # Grey dashed guide lines plus an "MGM" text annotation.
        # NOTE(review): matplotlib documents the 2nd/3rd positional
        # arguments of axvline (ymin, ymax) and axhline (xmin, xmax) as
        # axes fractions in [0, 1]; the values passed here (y_max, 5, 49)
        # look like data coordinates - confirm the intent.
        # NOTE(review): the annotation is placed at y=72, below the lower
        # y limit (74) applied at the end of this function.
        y_max = ax.get_ylim()[1]
        ax.axvline(50, 0, y_max, color="grey", linestyle="dashed")
        ax.axhline(74, 5, 49, color="grey", linestyle="dashed")
        ax.annotate("MGM", (5, 72))

    if "MGM" in set(df["Tool"]):
        sns.lineplot(df[df["Tool"] == "MGM"], "chunk-size", "percentage-common-3prime-and-5prime-from-common-3prime",
                     hue="Genome",
                     sns_kwargs={"palette": CM.get_map("verified"), "linestyle": "-."},
                     ax=ax,
                     legend=False)

    # The last five entries of ax.lines become dotted.  This runs whether or
    # not MGM lines were drawn above, and ax.lines also contains the
    # axvline/axhline artists when args.with_mgm is set.
    # NOTE(review): 5 is presumably the number of genomes - confirm.
    for l in ax.lines[len(ax.lines)-5:]:
        l.set_linestyle(":")

    # Final labels/limits for the shared axes; fo.save_fig is the path the
    # figure is written to.
    fo = FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74,101],
                     save_fig=next_name(env["pd-work"])
                 )
    FigureOptions.set_properties_for_axis(ax, fo)
    plt.savefig(fo.save_fig)
    plt.show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot agreement probabilities under three dependence assumptions.

    Builds a data frame with ``compute_data`` for the conditions "Random",
    "Independent" and "Fully dependent", then draws, in order:

      1. ``plot_sensitivities_vs_num_candidates`` (the only consumer of
         ``sen_a`` and ``sen_b``);
      2. "Probability" vs. number of candidates at sensitivity 0.9;
      3. "1 - Probability" vs. number of candidates at sensitivity 0.9;
      4. a two-panel figure for "Probability";
      5. "Probability" vs. number of candidates for the "Independent"
         condition, one hue per sensitivity in 0.1 ... 0.9;
      6. a two-panel figure for "Agree given prediction".

    All figures are saved via ``next_name(".")``.

    :param max_candidates: forwarded to ``compute_data`` and
        ``plot_sensitivities_vs_num_candidates``
    :param sen_a: forwarded to ``plot_sensitivities_vs_num_candidates``
    :param sen_b: forwarded to ``plot_sensitivities_vs_num_candidates``
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)

    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row selections reused by several plots.  They are boolean masks, so a
    # fresh slice of df (including columns added later) is taken per plot.
    # NOTE(review): exact float comparison with 0.9 assumes compute_data
    # stores exactly that value - confirm.
    both_at_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sensitivities = df["Sensitivity A"] == df["Sensitivity B"]
    equal_at_25 = equal_sensitivities & (df["Number of candidates"] == 25)

    def two_panel_figure(y_column, right_title):
        # Left: y_column vs. number of candidates at sensitivity 0.9.
        # Right: y_column vs. sensitivity at 25 candidates.
        fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))

        sns.lineplot(
            df[both_at_09],
            "Number of candidates",
            y_column,
            hue="Condition",
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            ax=axes[0],
            legend=False,
            figure_options=FigureOptions(title="Sensitivity = 0.9", ))

        sns.lineplot(
            df[equal_at_25],
            "Sensitivity A",
            y_column,
            hue="Condition",
            ax=axes[1],
            sns_kwargs={"palette": CM.get_map("independence-conditions")},
            figure_options=FigureOptions(
                ylim=[0, 1.05],
                xlim=[0, 1],
                xlabel="Sensitivity",
                title=right_title,
            ))

        save_figure(FigureOptions(save_fig=next_name(".")), fig)
        plt.show()

    sns.lineplot(
        df[both_at_09],
        "Number of candidates",
        "Probability",
        hue="Condition",
        sns_kwargs={"palette": CM.get_map("independence-conditions")},
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y=s|x_1=y, x_2=y)$",
        ))

    # error
    df["1 - Probability"] = 1 - df["Probability"]
    sns.lineplot(
        df[both_at_09],
        "Number of candidates",
        "1 - Probability",
        hue="Condition",
        sns_kwargs={"palette": CM.get_map("independence-conditions")},
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y\neq s|x_1=y, x_2=y)$",
        ))

    two_panel_figure("Probability", "Number of candidates = 25")

    # rename() returns a new frame; renaming the filtered slice in place
    # would trip pandas' chained-assignment (SettingWithCopy) check.
    df_tmp = df[equal_sensitivities
                & (df["Condition"] == "Independent")
                & df["Sensitivity A"].isin(
                    {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9})].rename(
                        columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        df_tmp,
        "Number of candidates",
        "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(title="Independent algorithms",
                                     save_fig=next_name(".")),
    )

    two_panel_figure("Agree given prediction", "Number of targets = 25")
Пример #13
0
def plot_candidate_stops(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Call plot_candidate_codons for TAA, TAG and TGA with the "stops" map."""
    from sbsp_viz.colormap import ColorMap as CM

    stop_codons = ["TAA", "TAG", "TGA"]
    stop_palette = CM.get_map("stops")
    plot_candidate_codons(env, df, stop_codons, stop_palette)