def kimura_dist_plot(env, df):
    """Overlay one "Average-Kimura" density curve per ancestor on a single axis.

    For every distinct value of ``df["Ancestor"]`` a ``seaborn.distplot`` curve
    (histogram disabled) is drawn, coloured via ``CM.get_map("ancestor")``.
    The figure is then handed to ``save_figure`` with a name derived from
    ``env["pd-work"]`` and displayed.

    :param env: mapping that provides the "pd-work" entry used for the output name
    :param df: data frame holding the "Ancestor" and "Average-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    clades = list(set(df["Ancestor"]))

    fig, axis = plt.subplots()  # type: plt.Figure, plt.Axes
    for clade in clades:
        clade_values = df.loc[df["Ancestor"] == clade, "Average-Kimura"]
        curve_color = CM.get_map("ancestor")[clade]
        seaborn.distplot(clade_values,
                         ax=axis,
                         color=curve_color,
                         hist=False,
                         label=clade)

    # Legend entries follow the order in which the curves were drawn.
    axis.legend(clades)
    axis.set_ylabel("PDF")

    out_name = next_name(env["pd-work"])
    save_figure(FigureOptions(save_fig=out_name))
    plt.show()
Пример #2
0
def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn categorical plot of ``y`` against ``x``, then save and show it.

    :param df: data frame holding the plotted columns
    :param x: column name for the categorical axis
    :param y: column name for the value axis
    :param hue: optional column used to colour/group the data
    :param kind: seaborn ``catplot`` kind (e.g. "box", "point")
    :param figure_options: options applied to the first axis and passed to ``save_figure``
    :param kwargs: optional settings read by this wrapper:
        ``sns_kwargs`` (dict forwarded to ``sns.catplot``), ``legend``
        (falsy disables the legend), ``legend_loc`` (matplotlib legend
        location) and ``legend_title``.
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    g = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False, aspect=1.5, **sns_kwargs)

    if kind == "point":
        # Thin out the lines of point plots.
        plt.setp(g.ax.lines, linewidth=1)

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)

    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # The title is honoured for explicit locations as well.
            plt.legend(loc=legend_loc, title=title)

    save_figure(figure_options)
    plt.show()
Пример #3
0
def scatterplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn scatter plot of ``y`` against ``x``, then save and show it.

    :param df: data frame holding the plotted columns
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column used to colour the points
    :param figure_options: options applied to the axis and passed to ``save_figure``
    :param kwargs: optional settings read by this wrapper:
        ``sns_kwargs`` (dict forwarded to ``sns.scatterplot``), ``ax`` (axes
        to draw on; a new figure is created when omitted), ``identity``
        (draw a dashed red identity line), ``legend`` (falsy disables the
        legend), ``legend_loc`` and ``legend_title``.
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)

    identity = get_value(kwargs, "identity", False)

    if ax is None:
        _, ax = plt.subplots()

    # Draw on the requested axes. Copy the kwargs so the caller's dict is not
    # modified, and keep an "ax" the caller already placed in sns_kwargs.
    plot_kwargs = dict(sns_kwargs)
    plot_kwargs.setdefault("ax", ax)
    sns.scatterplot(x=x, y=y, hue=hue, data=df, linewidth=0, **plot_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # Attach the legend to `ax` itself, not to whichever axes is current.
        if not legend_loc:
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            ax.legend(loc=legend_loc, title=title)

    save_figure(figure_options)
    plt.show()
Пример #4
0
def tsplot(df, x, y, hue=None, figure_options=None, **kwargs):
    """Plot ``df[y]`` over ``df[x]`` with ``sns.tsplot`` on a fresh figure.

    ``hue`` is not forwarded to the plotting call; it only decides whether a
    legend is placed to the right of the axes.
    NOTE(review): ``tsplot`` is presumably seaborn's deprecated function -
    confirm it exists in the installed version.

    :param df: data frame holding the plotted columns
    :param x: column whose values are passed as the second positional argument
    :param y: column whose values are passed as the first positional argument
    :param hue: when not None, a legend is added
    :param figure_options: options passed to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict) is forwarded to ``sns.tsplot``
    """
    plt.subplots()  # start a new figure; the plot lands on its (current) axes
    extra_args = get_value(kwargs, "sns_kwargs", dict())

    series_values = df[y].values
    time_values = df[x].values
    sns.tsplot(series_values, time_values, **extra_args)

    wants_legend = hue is not None
    if wants_legend:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    save_figure(figure_options)
    plt.show()
Пример #5
0
def distplot(df, x, figure_options=None, **kwargs):
    """Plot the distribution of column ``x`` with ``sns.distplot``, then save and show it.

    Defaults of ``kde=True`` and ``bins=50`` are applied unless the caller
    overrides them through ``sns_kwargs``.

    :param df: data frame holding the plotted column
    :param x: name of the column whose distribution is drawn
    :param figure_options: options applied to the axis and passed to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict) is forwarded to ``sns.distplot``
    """
    _, ax = plt.subplots()

    # Work on a copy so the caller's dict is never modified.
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("kde", True)
    # Taking the default from the dict avoids a duplicate-keyword TypeError
    # when the caller supplies its own "bins".
    sns_kwargs.setdefault("bins", 50)

    g = sns.distplot(df[x], **sns_kwargs)

    FigureOptions.set_properties_for_axis(g.axes, figure_options)
    save_figure(figure_options)
    plt.show()
Пример #6
0
def barplot(df, x, y, hue, figure_options=None, **kwargs):
    """Draw a seaborn bar plot of ``y`` per ``x``, then save and show it.

    :param df: data frame holding the plotted columns
    :param x: column name for the categorical axis
    :param y: column name for the bar heights
    :param hue: grouping column; when not None a legend is placed right of the axes
    :param figure_options: options applied to the returned axes and passed to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict forwarded to ``sns.barplot``) and
        ``ax`` (axes to draw on, ``None`` by default)
    """
    extra_args = get_value(kwargs, "sns_kwargs", dict())
    target_ax = get_value(kwargs, "ax", None)

    bar_axes = sns.barplot(x=x, y=y, data=df, hue=hue, ax=target_ax, **extra_args)

    has_groups = hue is not None
    if has_groups:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(bar_axes, figure_options)

    # Tighten the layout before saving so the saved file reflects it.
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()
Пример #7
0
def kdeplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, Union[str, None], Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a kernel density estimate of ``x`` (and optionally ``y``) on a new figure.

    :param df: data frame holding the plotted columns
    :param x: column passed as the first data argument of ``sns.kdeplot``
    :param y: optional second column; ``None`` passes no second data vector
    :param hue: not forwarded to seaborn; when not None a legend is added
    :param figure_options: options applied to the new axes and passed to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict) is forwarded to ``sns.kdeplot``
    """
    extra_args = get_value(kwargs, "sns_kwargs", dict())

    _, axis = plt.subplots()

    if y is None:
        second_values = None
    else:
        second_values = df[y]

    # No axes is passed explicitly, so seaborn draws on the current one.
    sns.kdeplot(df[x], second_values, legend=False, **extra_args)

    if hue is not None:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(axis, figure_options)
    save_figure(figure_options)
    plt.show()
def contour_kimura_per_ancestor(env, df):
    """Draw one Min-Kimura vs Max-Kimura KDE panel per ancestor.

    The panels are laid out in two rows with shared x and y axes. An
    invisible full-figure subplot carries the common axis labels. The figure
    is passed to ``save_figure`` with a name derived from ``env["pd-work"]``
    and then shown.

    :param env: mapping that provides the "pd-work" entry used for the output name
    :param df: data frame with "Ancestor", "Min-Kimura" and "Max-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    clades = sorted(set(df["Ancestor"]))
    n_columns = math.ceil(len(clades) / 2)
    fig, axes = plt.subplots(2, n_columns, sharex=True, sharey=True, figsize=(6, 6))

    for clade, panel in zip(clades, axes.ravel()):
        clade_rows = df[df["Ancestor"] == clade]
        min_kimura = clade_rows["Min-Kimura"].values
        max_kimura = clade_rows["Max-Kimura"].values
        seaborn.kdeplot(min_kimura, max_kimura, ax=panel)
        panel.set_title(clade)

    # Frameless axes spanning the figure: used only to hold the shared labels.
    fig.add_subplot(111, frameon=False)
    hidden_ticks = dict(top=False, bottom=False, left=False, right=False,
                        labelbottom=False, labeltop=False,
                        labelleft=False, labelright=False)
    plt.tick_params(which="both", **hidden_ticks)
    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()
    out_name = next_name(env["pd-work"])
    save_figure(FigureOptions(save_fig=out_name))

    plt.show()
Пример #9
0
def lmplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> sns.FacetGrid
    """Draw a seaborn ``lmplot`` of ``y`` against ``x``, save it, show it and return the grid.

    :param df: data frame holding the plotted columns
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column used to split/colour the fits
    :param figure_options: options applied to the first axis and passed to ``save_figure``
    :param kwargs: optional settings read by this wrapper:
        ``sns_kwargs`` (dict forwarded to ``sns.lmplot``; ``aspect`` defaults
        to 2), ``legend`` (falsy disables the legend), ``legend_loc`` and
        ``legend_title``.
    :return: the grid object returned by ``sns.lmplot``
    """
    # Copy before applying defaults so the caller's dict is left untouched.
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("aspect", 2)

    g = sns.lmplot(x=x, y=y, hue=hue, data=df, legend=False, **sns_kwargs)

    main_ax = g.axes[0][0]
    FigureOptions.set_properties_for_axis(main_ax, figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            main_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # The title is honoured for explicit locations as well.
            main_ax.legend(loc=legend_loc, title=title)

    save_figure(figure_options, fig=g.fig)
    plt.subplots_adjust(right=1)
    plt.show()
    return g
Пример #10
0
def lineplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn line plot of ``y`` against ``x``.

    When the caller supplies ``ax`` the plot is drawn there and, by default,
    nothing is saved or shown, so several plots can share a figure. Without
    ``ax`` a new figure is created, saved and shown.

    :param df: data frame holding the plotted columns
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column used to split/colour the lines
    :param figure_options: options applied to the axis and passed to ``save_figure``
    :param kwargs: optional settings read by this wrapper:
        ``sns_kwargs`` (dict forwarded to ``sns.lineplot``), ``ax``,
        ``show`` (defaults to True only when no ``ax`` is given), ``legend``
        (forwarded to seaborn; falsy disables the legend), ``legend_loc``,
        ``legend_ncol``, ``legend_title`` (an empty string drops the first
        legend entry) and ``identity`` (draw a dashed red identity line).
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    show = get_value(kwargs, "show", ax is None)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    legend_ncol = get_value(kwargs, "legend_ncol", 1)

    identity = get_value(kwargs, "identity", False)

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()

    sns.lineplot(x=x, y=y, hue=hue, data=df, ax=ax, legend=legend, **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # Always build the legend on `ax`: with a caller-supplied subplot the
        # pyplot "current axes" may be a different panel.
        if not legend_loc:
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title, ncol=legend_ncol)
        else:
            ax.legend(loc=legend_loc, ncol=legend_ncol, title=title)
        if title is not None and len(title) == 0:
            # Empty title: rebuild the legend without its first entry.
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[1:], labels=labels[1:], ncol=legend_ncol)

    if show:
        save_figure(figure_options, fig)
        plt.show()
Пример #11
0
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one ``loess_with_stde`` panel of "GC" vs ``M:<tag>`` per 5' tag.

    Tags come from ``get_tags_for_5prime(df)``. The panels are laid out in two
    rows with shared axes. Only the first panel is asked to draw a colour bar,
    into a dedicated axes on the right of the figure. An invisible
    full-figure subplot carries the common axis labels. The figure is saved
    under a name derived from ``env["pd-work"]`` and then shown.

    :param env: mapping that provides the "pd-work" entry used for the output name
    :param df: data frame with a "GC" column and one ``M:<tag>`` column per tag
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)

    fig, ax = plt.subplots(2,
                           math.ceil(num_tags / 2),
                           sharey="all",
                           sharex="all")
    # Dedicated axes on the right-hand side for the shared colour bar.
    fig.add_axes([.91, .3, .03, .4])
    cbar_ax = fig.axes[-1]

    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    cbar_enable = {
        "cbar_ax": cbar_ax,
        "cbar": True,
    }

    # Iterate over every tag. The previous version zipped with a 4-element
    # colour list that was never used and silently capped the plot at 4 tags.
    for counter, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        # Only the first panel draws the colour bar.
        extra = cbar_enable if counter == 0 else dict()
        loess_with_stde(df, "GC", f"M:{tag}", a, tag.replace("=", ","),
                        **kws, **extra)

        a.set_title(
            tag.replace("=", ",").replace("NCBI", "PGAP").replace(
                "GMS2", "GeneMarkS-2"))
        # Per-panel labels are blanked; shared labels are added below.
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Frameless axes spanning the figure: used only to hold the shared labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    # Leave room on the right for the colour-bar axes.
    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])

    save_figure(figure_options, fig)
    plt.show()
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Analyze and plot how the distance to the upstream gene relates to prediction consistency.

    Steps, in order:

    1. Create the ``upstream_distances`` sub-directory of ``env["pd-work"]``.
    2. Parse the string-encoded "Upstream-distance" lists, derive
       "Most frequent upstream" and the consistency columns "PC(x,0)" and
       "PC(x,3)", and keep rows with "Support" > 10.
    3. Plot consistency vs. most frequent upstream distance, the distance
       distribution, and per-ancestor percentage/count point plots.
    4. Summarise "(GMS2=SBSP)!=NCBI % GMS2=SBSP" and "GMS2=SBSP" per
       distance range and ancestor (first averaged per GCFID, then pooled
       per ancestor) and plot the results.

    NOTE(review): calls such as ``sns.lmplot(df, x, y, figure_options=...)``
    presumably go to a project-level wrapper module named ``sns`` rather than
    to seaborn itself - confirm against the file's imports.

    :param env: mapping that provides the "pd-work" directory
    :param df: data frame with (at least) the columns "Upstream-distance",
        "Support", "Ancestor", "GMS2=SBSP", "NCBI" and "(GMS2=SBSP)!=NCBI"
    """
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists
    df = df[df["Upstream-distance"] != "[]"].copy()
    # the column holds string literals; turn them into Python objects
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # for mf in range(-20, 50):
    #     df_mf = df[df["Most frequent upstream"] == mf]
    #     if len(df_mf) < 50:
    #         continue
    #
    #     sns.distplot(df_mf, "PC(x,0)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 0),
    #         save_fig=next_name(pd_work),
    #         xlim=(0,1)
    #     ))
    #     sns.distplot(df_mf, "PC(x,3)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 3),
    #         save_fig=next_name(pd_work),
    #         xlim=(0, 1)
    #     ))

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    # df_tmp: all rows with most frequent upstream distance in (-50, 100)
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]
    # NCBI consistency as a func
    # from here on df additionally requires the "GMS2=SBSP" flag to be set
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    # reshape so that PC(x,0) and PC(x,3) share one value column "PC(x,f)",
    # with the source column recorded in "Flexibility"
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)",
                "Ancestor"]], ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #             hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility")
    # plt.show()

    # lowess fit of consistency vs. distance, one curve per flexibility
    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # distribution of the most frequent upstream distance
    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    import seaborn
    # seaborn.countplot("Most frequent upstream", data=df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)], hue="Ancestor")
    # percentage of rows at each distance in (-10, 10), normalised within each ancestor
    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # same plot as above, with absolute counts instead of percentages
    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # histogram (left axis) and KDE (twin right axis) per ancestor; this
    # figure is only shown, not saved
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)

        # ax2.set_ylim(0, 3)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    # g = seaborn.FacetGrid(df, hue="Ancestor")
    # g = g.map(seaborn.distplot, "Most frequent upstream", hist=True)
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    # lowess fit of PC(x,0) vs. distance, one curve per ancestor
    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    # lowess fit of PC(x,3) vs. distance, one curve per ancestor
    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity
    # collect:
    # average 5' per ancestor, r,

    # half-open distance ranges [low, high)
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:

        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        # viz_summary_per_gcfid(env, df_summary_per_gcfid, title=str(r))

        # average the per-GCFID summaries within each ancestor
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()

    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            # rows where "GMS2=SBSP" and "NCBI" are both set
            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            # ... of which "(GMS2=SBSP)!=NCBI" is also set
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # percentage of the rows above that carry the "!=NCBI" flag
            # NOTE(review): the denominator can be 0 for an empty selection - confirm handling
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor":
                ancestor,
                "Range":
                str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP":
                sensitivity,
                "GMS2=SBSP":
                f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # one row of axes per ancestor: percentage on the left axis, count (red)
    # on a twin right axis
    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg",
                         "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot("range_avg",
                         "GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    # label the x ticks with the range tuples instead of their midpoints
    plt.xticks(range_avgs, range_label)
    plt.show()

    # all ancestors on one figure: percentage on the left axis, count on the right
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg",
                     "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot("range_avg",
                     "GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    # plt.xticks(range_avgs, range_label)
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    # overrides the "Range Average" x label set a few lines above
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()

    # sbsp_geom_density(df, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work)
    #
    # for ancestor, df_group in df.groupby("Ancestor", as_index=False):
    #     sbsp_geom_density(df_group, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work, ancestor)
    #     sbsp_geom_density(df_group, "Support", "GMS2=SBSP=NCBI", pd_work, ancestor)

    # NOTE(review): this value is never read - looks like a leftover statement
    a = 0
def heat_map_Kimura_accuracy(env, df_all, x, y, num_steps=20, balance=False):
    # type: (Environment, pd.DataFrame, str, str, int, bool) -> None
    """Draw, per ancestor, a heat map of agreement with NCBI binned over ``x`` and ``y``.

    For each ancestor the ``x``/``y`` value ranges are divided into
    ``num_steps`` equal bins. A cell's value is the number of rows with
    "GMS2=SBSP=NCBI" set divided by the number of rows with both "GMS2=SBSP"
    and "NCBI" set. Cells with fewer than 10 such rows are left empty (NaN).
    All panels share one colour bar with range [0, 1]. The figure is saved
    under a name derived from ``env["pd-work"]`` and then shown.

    :param env: mapping that provides the "pd-work" entry used for the output name
    :param df_all: data frame with "Ancestor", "GMS2=SBSP", "NCBI",
        "GMS2=SBSP=NCBI" and the ``x``/``y`` columns
    :param x: column binned along the horizontal axis
    :param y: column binned along the vertical axis
    :param num_steps: number of bins per axis
    :param balance: when True, both axes use the same (combined) value range
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn

    min_support = 10  # cells backed by fewer rows than this are blanked

    ancestors = sorted(set(df_all["Ancestor"]))
    fig, axes = plt.subplots(2,
                             math.ceil(len(ancestors) / 2),
                             sharex=True,
                             sharey=True)
    # Dedicated axes on the right-hand side for the shared colour bar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    # Roughly five ticks per axis; the step never drops to 0, which would
    # make range() raise for num_steps < 5.
    tick_step = max(1, num_steps // 5)
    xticks = list(range(0, num_steps, tick_step))
    yticks = list(range(0, num_steps, tick_step))

    last_bin = num_steps - 1

    for axis_idx, (ancestor, df) in enumerate(
            df_all.groupby("Ancestor", as_index=False)):
        ax = axes.ravel()[axis_idx]

        # The small epsilon keeps the maximum value inside the last bin.
        min_x = min(df[x])
        max_x = max(df[x]) + 0.000000001

        min_y = min(df[y])
        max_y = max(df[y]) + 0.000000001

        if balance:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)

        ss_x = (max_x - min_x) / float(num_steps)
        ss_y = (max_y - min_y) / float(num_steps)

        gms2_eq_sbsp_and_ncbi = np.zeros([num_steps, num_steps], dtype=float)
        gms2_eq_sbsp_eq_ncbi = np.zeros([num_steps, num_steps], dtype=float)

        rows = zip(df[x], df[y], df["GMS2=SBSP"], df["NCBI"],
                   df["GMS2=SBSP=NCBI"])
        for x_val, y_val, gms2_eq_sbsp, has_ncbi, all_equal in rows:
            # Clamp in case rounding swallowed the epsilon for large values.
            x_pos = min(int((x_val - min_x) / ss_x), last_bin)
            y_pos = min(int((y_val - min_y) / ss_y), last_bin)

            if gms2_eq_sbsp and has_ncbi:
                gms2_eq_sbsp_and_ncbi[x_pos][y_pos] += 1
            if all_equal:
                gms2_eq_sbsp_eq_ncbi[x_pos][y_pos] += 1

        gms2_eq_sbsp_and_ncbi[gms2_eq_sbsp_and_ncbi < min_support] = 0

        # Divide only where support remains. Everything else stays NaN so the
        # cell is left blank, instead of becoming inf (drawn as accuracy 1).
        accuracy = np.full_like(gms2_eq_sbsp_eq_ncbi, np.nan)
        np.divide(gms2_eq_sbsp_eq_ncbi,
                  gms2_eq_sbsp_and_ncbi,
                  out=accuracy,
                  where=gms2_eq_sbsp_and_ncbi > 0)

        # Tick labels are the lower edge of the bin at each tick position.
        l_x = np.arange(min_x, max_x, ss_x)
        l_y = np.arange(min_y, max_y, ss_y)
        xticklabels = [round(l_x[i], 2) for i in xticks]
        yticklabels = [round(l_y[i], 2) for i in yticks]

        # Transposed so that x runs along the horizontal axis.
        g = seaborn.heatmap(accuracy.transpose(),
                            vmin=0,
                            vmax=1,
                            xticklabels=xticklabels,
                            yticklabels=yticklabels,
                            ax=ax,
                            cbar=False)

        g.invert_yaxis()
        g.set_xticks(xticks)
        g.set_yticks(yticks)
        g.set_xticklabels(xticklabels, rotation=0)

        g.set_title(ancestor)
        # All panels use vmin=0 / vmax=1, so any of them can feed the colour bar.
        mappable = ax.collections[0]

    # Frameless axes spanning the figure: used only to hold the shared labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(x, labelpad=20)
    plt.ylabel(y, labelpad=30)

    plt.colorbar(mappable, cax=cbar_ax)
    # Leave room on the right for the colour-bar axes.
    fig.tight_layout(rect=[0, 0, .9, 1])

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))

    plt.show()
def visualize_matrix_column(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Embed the matrices of column ``col`` with UMAP and plot the embedding three ways.

    Rows where ``col`` is NA are dropped. The remaining matrices are flattened
    to one row each and fitted with ``umap.UMAP``. The embedding is plotted
    coloured by "GC" (with a colour bar), labelled by "GENOME_TYPE" and
    labelled by "Type". Each plot is titled ``col``, saved under a name
    derived from ``env["pd-work"]`` and shown.

    :param env: mapping that provides the "pd-work" entry used for the output names
    :param df: data frame with ``col``, "GC", "GENOME_TYPE" and "Type" columns
    :param col: name of the column that holds the matrices
    """

    def _title_save_show():
        # Shared tail of every plot: title, layout, save, display.
        plt.title(col)
        plt.tight_layout()
        save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
        plt.show()

    # only rows that actually have a value for the column are usable
    df = df[~df[col].isna()]

    fp = FontProperties()
    fp.set_family("monospace")

    # 3-D array (one matrix per row; N x 6 x 4 for RBS according to the
    # original note), flattened to one vector per row
    mat = create_numpy_for_column(df, col)
    n_items, n_rows, n_cols = mat.shape[0], mat.shape[1], mat.shape[2]
    mat = mat.reshape((n_items, n_rows * n_cols))

    # features used to colour/label the embedding
    gc = df["GC"]
    group = df["GENOME_TYPE"]

    for seed in range(1):

        reducer = umap.UMAP(random_state=seed).fit(mat)
        embedding = reducer.embedding_
        print(embedding.shape)

        # 1) continuous colouring by GC, with a matching colour bar
        ax = umap.plot.points(reducer, values=gc, cmap="viridis")
        gc_mappable = create_mappable_for_colorbar(gc, "viridis")
        plt.colorbar(gc_mappable)
        _title_save_show()

        # 2) categorical labelling by genome type
        umap.plot.points(reducer, labels=group.values, color_key_cmap="Paired")
        _title_save_show()

        # 3) categorical labelling by the "Type" column
        umap.plot.points(reducer, labels=df["Type"])
        _title_save_show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Draw the line plots summarising the table built by ``compute_data``.

    The table is computed for three conditions ("Random", "Independent",
    "Fully dependent") and is read through the columns "Sensitivity A",
    "Sensitivity B", "Number of candidates", "Condition", "Probability" and
    "Agree given prediction". Every figure is saved under a fresh name
    obtained from ``next_name(".")``; nothing is returned.

    :param max_candidates: forwarded to ``compute_data`` and to
        ``plot_sensitivities_vs_num_candidates``.
    :param sen_a: forwarded to ``plot_sensitivities_vs_num_candidates`` only.
    :param sen_b: forwarded to ``plot_sensitivities_vs_num_candidates`` only.

    NOTE: the line plots below are pinned to a sensitivity of 0.9 and to 25
    candidates; they do not depend on ``sen_a`` / ``sen_b``.
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent,
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent,
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)

    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row masks shared by several plots. They stay valid after new columns are
    # added to ``df`` because adding a column does not touch the index.
    same_sensitivity = df["Sensitivity A"] == df["Sensitivity B"]
    both_at_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    same_sensitivity_at_25 = same_sensitivity & (df["Number of candidates"] == 25)

    def condition_kwargs():
        # A fresh dict per call, so no plot shares a kwargs object with another.
        return {"palette": CM.get_map("independence-conditions")}

    def plot_vs_candidates(y, ylabel):
        # Single-axes plot of column ``y`` against the number of candidates,
        # one line per condition, restricted to sensitivity 0.9 for both.
        sns.lineplot(
            df[both_at_09],
            "Number of candidates",
            y,
            hue="Condition",
            sns_kwargs=condition_kwargs(),
            legend_loc="best",
            figure_options=FigureOptions(
                save_fig=next_name("."),
                ylabel=ylabel,
            ))

    def plot_condition_panels(y, right_title):
        # Two side-by-side panels sharing the y axis: column ``y`` against the
        # number of candidates (left) and against the sensitivity (right).
        fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))

        sns.lineplot(df[both_at_09],
                     "Number of candidates",
                     y,
                     hue="Condition",
                     sns_kwargs=condition_kwargs(),
                     ax=axes[0],
                     legend=False,
                     figure_options=FigureOptions(title="Sensitivity = 0.9", ))

        sns.lineplot(df[same_sensitivity_at_25],
                     "Sensitivity A",
                     y,
                     hue="Condition",
                     ax=axes[1],
                     sns_kwargs=condition_kwargs(),
                     figure_options=FigureOptions(
                         ylim=[0, 1.05],
                         xlim=[0, 1],
                         xlabel="Sensitivity",
                         title=right_title,
                     ))

        save_figure(FigureOptions(save_fig=next_name(".")), fig)
        plt.show()

    plot_vs_candidates("Probability", r"$P(y=s|x_1=y, x_2=y)$")

    # Complement of the probability above, plotted the same way.
    df["1 - Probability"] = 1 - df["Probability"]
    plot_vs_candidates("1 - Probability", r"$P(y\neq s|x_1=y, x_2=y)$")

    plot_condition_panels("Probability", "Number of candidates = 25")

    # "Independent" rows only, one line per sensitivity level. ``rename``
    # returns a new frame, which avoids renaming in place on a filtered
    # selection of ``df`` (the pattern behind SettingWithCopyWarning).
    independent_rows = df[
        same_sensitivity
        & (df["Condition"] == "Independent")
        & (df["Sensitivity A"].isin(
            {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))
    ].rename(columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        independent_rows,
        "Number of candidates",
        "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(
            title="Independent algorithms",
            save_fig=next_name(".")),
    )

    # NOTE(review): this title says "targets" while the rows are selected on
    # the "Number of candidates" column -- confirm the intended wording.
    plot_condition_panels("Agree given prediction", "Number of targets = 25")