def print_csvs(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    """Write one accuracy table per tool to CSV, then plot accuracy trends.

    For each of NCBI, GMS2, SBSP and GMS2=SBSP a column subset of ``df`` is
    passed to ``df_to_pf_csv`` together with a fresh file name obtained from
    ``next_name(pd_work, ext="csv")``. Afterwards ``analyze_by_support`` and
    ``analyze_by_step_group`` are run for SBSP and GMS2=SBSP.

    :param env: mapping that provides the "pd-work" entry used as the output
        location (presumably a directory path -- confirm against caller)
    :param df: frame holding the "Genome" column and the per-tool columns
        listed below; its "Genome" column is overwritten in place
    :param kwargs: optional "fn_prefix", forwarded to the analysis helpers
    """
    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)
    pd_work = env["pd-work"]

    # NOTE: this rewrites the caller's frame, not a copy.
    df["Genome"] = df["Genome"].apply(short_name)

    print(df.columns)

    # (leading columns, tag used in the match-statistics column names).
    # The leading-column order differs for GMS2=SBSP and is kept as-is.
    tables = [
        (["Genome", "NCBI", "Verified"], "NCBI"),
        (["Genome", "GMS2", "Verified"], "GMS2"),
        (["Genome", "SBSP", "Verified"], "SBSP"),
        (["Genome", "Verified", "GMS2=SBSP"], "GMS2=SBSP"),
    ]
    for leading_columns, tag in tables:
        columns = leading_columns + [
            "Number 3p match: Verified from {}".format(tag),
            "Percentage 3p match: Verified from {}".format(tag),
            "Number 5p-3p match: Verified from {}".format(tag),
            "Percentage 5p-3p match: Verified from {}".format(tag),
        ]
        df_to_pf_csv(df[columns], next_name(pd_work, ext="csv"))

    # by support, then by step group (same call order as before)
    for tag in ("SBSP", "GMS2=SBSP"):
        analyze_by_support(df, pd_work, fn_prefix, tag)

    for tag in ("SBSP", "GMS2=SBSP"):
        analyze_by_step_group(df, pd_work, fn_prefix, tag)
示例#2
0
def analyze_kimura_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Summarise per-row Kimura distances and produce the Kimura figures.

    Rows whose "Kimura-to-query" cell equals the string "[]" are dropped.
    The remaining cells are parsed with ``ast.literal_eval`` and reduced to
    "Average-Kimura", "Std-Kimura", "Min-Kimura" and "Max-Kimura" columns.
    All of this happens on a copy, so the caller's frame is not modified.

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with at least "Kimura-to-query", "Genome GC",
        "Ancestor" and "GCFID" columns
    """
    pd_work = env["pd-work"]

    df = df[df["Kimura-to-query"] != "[]"].copy()
    df["Kimura-to-query"] = df["Kimura-to-query"].apply(ast.literal_eval)
    df["Average-Kimura"] = df["Kimura-to-query"].apply(np.mean)
    df["Std-Kimura"] = df["Kimura-to-query"].apply(np.std)

    # Trend only (no scatter) over all rows.
    sns.lmplot(df,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # "Kimura-to-query" holds Python lists at this point, so the mean must be
    # restricted to numeric columns: pandas >= 2.0 no longer drops
    # non-numeric columns silently and raises instead.
    df_mean = df.groupby(["Ancestor", "GCFID"],
                         as_index=False).mean(numeric_only=True)

    # Same plot on the per-(Ancestor, GCFID) means, with points shown.
    sns.lmplot(df_mean,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": True,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # Min/max kimura
    df["Min-Kimura"] = df["Kimura-to-query"].apply(min)
    df["Max-Kimura"] = df["Kimura-to-query"].apply(max)

    contour_kimura_per_ancestor(env, df)
    one_dim_Kimura_accuracy(env, df)

    kimura_dist_plot(env, df)
    heat_map_Kimura_accuracy(env,
                             df,
                             "Min-Kimura",
                             "Max-Kimura",
                             balance=True,
                             xlabel="Minimum Kimura",
                             ylabel="Maximum Kimura")
    heat_map_Kimura_accuracy(env,
                             df,
                             "Average-Kimura",
                             "Std-Kimura",
                             balance=False)
示例#3
0
def plot_fix_min_move_max(df):
    # type: (pd.DataFrame) -> None
    """Plot Sensitivity and Coverage against "Max" for rows with Min == 0.1.

    One panel is drawn per genome on a two-row grid with shared axes. The
    figure is saved under ``next_name(my_env["pd-work"])`` and then shown.

    :param df: frame with "Genome", "Min", "Max", "Sensitivity" and
        "Coverage" columns; genome names must contain at least two
        whitespace-separated words (the title uses the second one)
    """
    genomes = sorted(set(df["Genome"]))

    df = df[df["Min"] == 0.1]
    df = df.sort_values("Max", axis=0)

    num_cols = math.ceil(len(genomes) / 2)
    fig, axes = plt.subplots(2, num_cols, sharex="all", sharey="all")
    axes = axes.ravel()  # row-major: panel i sits at (i // num_cols, i % num_cols)

    for i, name in enumerate(genomes):
        ax = axes[i]
        df_tmp = df[df["Genome"] == name]

        ax.plot(df_tmp["Max"], df_tmp["Sensitivity"], label="Sensitivity")
        ax.plot(df_tmp["Max"], df_tmp["Coverage"], label="Coverage")
        ax.set_title(r"\textit{{{}. {}}}".format(name[0],
                                                 name.split()[1]),
                     style="italic")
        ax.set_ylim([49, 101])

        # Label only the outer panels. The previous `i % 2` / `i >= 2` tests
        # were correct only for a grid with exactly two columns.
        if i % num_cols == 0:
            ax.set_ylabel("Percentage")
        if i >= num_cols:
            ax.set_xlabel("Maximum Kimura")

    plt.subplots_adjust(bottom=0.17)
    fig.legend(loc="lower center", labels=["Accuracy", "Coverage"], ncol=2)
    fig.suptitle("StartLink Performance for Kimura thresholds [0.1, x]")
    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Draw the RBS motif logo and its spacer distribution side by side.

    The model is loaded with ``GMS2Mod.init_from_file(pf_mod)``. The left
    panel shows the "RBS_MAT" / "RBS_POS_DISTR" motif as an information
    logo against the "NON_MAT" background; the right panel plots the motif's
    spacer values by index. The figure is saved to ``next_name(pd_figures)``
    and then shown.

    :param pd_figures: location handed to ``next_name`` for the output file
    :param pf_mod: path of the model file to load
    :param title: title placed above the logo panel
    """
    gms2_model = GMS2Mod.init_from_file(pf_mod)
    rbs_motif = MotifModel(gms2_model.items["RBS_MAT"],
                           gms2_model.items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(gms2_model.items["NON_MAT"])

    import matplotlib.pyplot as plt
    _, (ax_logo, ax_spacer) = plt.subplots(1, 2)

    import logomaker as lm
    information_matrix = lm.transform_matrix(
        rbs_motif.pwm_to_df(),
        from_type="probability",
        to_type="information",
        background=noncoding.pwm_to_array(0))
    lm.Logo(information_matrix, ax=ax_logo)
    ax_logo.set_title(title)
    ax_logo.set_ylim(0, 2)

    spacer_table = pd.DataFrame({
        "Distance from start": range(len(rbs_motif._spacer)),
        "Probability": rbs_motif._spacer
    })
    spacer_options = FigureOptions(ylim=[0, 0.4])
    sns.lineplot(spacer_table,
                 "Distance from start",
                 "Probability",
                 ax=ax_spacer,
                 figure_options=spacer_options)

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()
def kimura_dist_plot(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Overlay one "Average-Kimura" density curve per ancestor on one axis.

    Ancestors are processed in sorted order so that curve order and legend
    order are the same on every run (iteration over a bare ``set`` is not
    stable across interpreter sessions).

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with "Ancestor" and "Average-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = sorted(set(df["Ancestor"]))

    fig, ax = plt.subplots()  # type: plt.Figure, plt.Axes
    for anc in ancestors:
        df_group = df[df["Ancestor"] == anc]
        seaborn.distplot(df_group["Average-Kimura"],
                         ax=ax,
                         color=CM.get_map("ancestor")[anc],
                         hist=False,
                         label=anc)

    # Labels are matched to the curves by drawing order, hence the same list.
    ax.legend(ancestors)
    ax.set_ylabel("PDF")
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
示例#6
0
def analyze_gms2_components_on_verified_set(env, gil):
    # type: (Environment, GenomeInfoList) -> None
    """Run the per-genome component analysis and bar-plot "Error" per genome.

    The frames returned by ``analyze_gms2_components_on_verified_set_for_gi``
    are concatenated, the "Genome" column is rewritten row-wise with
    ``fix_names``, the table is printed as CSV, and a bar plot grouped by
    "Component" is saved under ``next_name(env["pd-work"])``.

    :param env: mapping that provides the "pd-work" output location
    :param gil: iterable of genome entries; must yield at least one item,
        otherwise ``pd.concat`` raises ``ValueError``
    """
    # run different components
    list_df = [
        analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil
    ]

    df = pd.concat(list_df, ignore_index=True, sort=False)
    df["Genome"] = df.apply(fix_names, axis=1)
    print(df.to_csv())

    # Components from the fullest model down to plain MGM; bars are drawn in
    # the reverse of this order. A real list is passed (not a `reversed`
    # iterator) so the order survives being read more than once.
    components = [
        "GMS2", "MGM2*", "Start Context", "RBS", "Start Codons", "Promoter",
        "MGM"
    ]
    hue_order = components[::-1]

    fig, ax = plt.subplots(figsize=(12, 4))
    sns.barplot(df, "Genome", "Error", hue="Component",
                ax=ax,
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"])
                ),
                sns_kwargs={
                    "hue_order": hue_order,
                    "palette": CM.get_map("gms2_components")
                })
def sbsp_geom_density(df, x, y, pd_work, title=""):
    """Build a filled density plot of ``x`` coloured by ``y``, save and print it.

    :param df: data handed to ``ggplot``
    :param x: column mapped to the x aesthetic (also used as the x label)
    :param y: column mapped to both colour and fill
    :param pd_work: location handed to ``next_name`` for the output file
    :param title: plot title
    """
    density_plot = ggplot(df, aes(x, color=y, fill=y))

    # Components are added one at a time, in the original left-to-right order.
    components = [
        xlab(x),
        ylab("Fraction"),
        geom_density(position="fill", alpha=0.5),
        theme(subplots_adjust={"top": 0.9}),
        theme(legend_position=(.8, 0.95), legend_direction='horizontal'),
        ggtitle(title),
    ]
    for component in components:
        density_plot = density_plot + component

    density_plot.save(next_name(pd_work))
    print(density_plot)
def analyze_by_step_group(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages per "Step Group" for each genome.

    Each row of ``df`` carries, in column ``"by_step_group_<tag>"``, data that
    ``pd.DataFrame`` can turn into a table. These tables are stacked (with
    the row's genome attached), drawn as two point plots, and finally
    printed.

    :param df: one row per genome
    :param pd_work: location handed to ``next_name`` for the figure files
    :param fn_prefix: accepted for interface compatibility; not used here
    :param tag: tool tag used in the column names (e.g. "SBSP")
    :raises ValueError: from ``pd.concat`` when every genome is excluded
    """
    excluded_genomes = {"A. pernix", "Synechocystis"}
    source_column = "by_step_group_{}".format(tag)

    list_df = list()
    for index in df.index:
        genome = df.at[index, "Genome"]

        # Skip first, so no table is built for genomes that are dropped.
        if genome in excluded_genomes:
            continue

        curr_df = pd.DataFrame(df.at[index, source_column])
        curr_df["Genome"] = genome
        list_df.append(curr_df)

    df_acc = pd.concat(list_df)

    sns.catplot(
        df_acc,
        "Step Group",
        "Percentage 3p match: Verified from {}".format(tag),
        hue="Genome",
        kind="point",
        figure_options=FigureOptions(
            title="Percentage 3p match versus minimum support",
            ylabel="Percentage of 3p match",
            save_fig=next_name(pd_work),
            ylim=[None, 100.5]),
    )

    sns.catplot(
        df_acc,
        "Step Group",
        "Percentage 5p-3p match: Verified from {}".format(tag),
        kind="point",
        hue="Genome",
        figure_options=FigureOptions(
            title="Percentage 5p-3p match versus minimum support",
            ylabel="Percentage of 5p-3p match",
            save_fig=next_name(pd_work),
            ylim=[90, 100.5]),
    )

    print(df_acc.to_string())
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages against "Min Support" per genome.

    Each row of ``df`` carries, in column ``"by_support_<tag>"``, data that
    ``pd.DataFrame`` can turn into a table. These tables are stacked (with
    the row's genome attached) and drawn as two line plots.

    :param df: one row per genome
    :param pd_work: location handed to ``next_name`` for the figure files
    :param fn_prefix: accepted for interface compatibility; not used here
    :param tag: tool tag used in the column names and titles (e.g. "SBSP")
    :raises ValueError: from ``pd.concat`` when every genome is excluded
    """
    excluded_genomes = {"A. pernix", "Synechocystis"}
    source_column = "by_support_{}".format(tag)

    list_df = list()
    for index in df.index:
        genome = df.at[index, "Genome"]

        # Skip first, so no table is built for genomes that are dropped.
        if genome in excluded_genomes:
            continue

        curr_df = pd.DataFrame(df.at[index, source_column])
        curr_df["Genome"] = genome
        list_df.append(curr_df)

    df_acc = pd.concat(list_df)

    sns.lineplot(
        df_acc,
        "Min Support",
        "Percentage 3p match: Verified from {}".format(tag),
        hue="Genome",
        figure_options=FigureOptions(
            title="Percentage of verified genes predicted\nby {}".format(tag),
            ylabel="Percentage",
            save_fig=next_name(pd_work),
            ylim=[None, 100.5]))

    sns.lineplot(
        df_acc,
        "Min Support",
        "Percentage 5p-3p match: Verified from {}".format(tag),
        hue="Genome",
        figure_options=FigureOptions(
            title="Percentage of predicted {} genes\nwith correct 5' end".
            format(tag),
            ylabel="Percentage of 5p-3p match",
            save_fig=next_name(pd_work),
            ylim=[90, 100.5]))
示例#10
0
def plot_move_consecutive_blocks(df):
    # type: (pd.DataFrame) -> None
    """Plot Sensitivity and Coverage for consecutive [Min, Max] blocks.

    The distinct values found in "Min" and "Max" are sorted; only rows whose
    (Min, Max) pair equals two neighbouring values are kept. Each genome
    gets one panel, plotted against the block midpoint. The figure is saved
    under ``next_name(my_env["pd-work"])`` and then shown.

    :param df: frame with "Genome", "Min", "Max", "Sensitivity" and
        "Coverage" columns; genome names must contain at least two
        whitespace-separated words (the title uses the second one)
    :raises ValueError: from ``pd.concat`` when fewer than two distinct
        threshold values exist
    """
    genomes = sorted(set(df["Genome"]))
    num_cols = math.ceil(len(genomes) / 2)
    fig, axes = plt.subplots(2, num_cols, sharex="all", sharey="all")
    axes = axes.ravel()  # row-major: panel i sits at (i // num_cols, i % num_cols)

    all_kimura_values = sorted(set(df["Max"]).union(set(df["Min"])))

    # Rows for consecutive block pairs. This does not depend on the genome,
    # so it is computed once instead of once per panel.
    list_df = [
        df[(df["Min"] == low) & (df["Max"] == high)]
        for low, high in zip(all_kimura_values, all_kimura_values[1:])
    ]
    df_blocks = pd.concat(list_df, sort=False)
    df_blocks["Average"] = (df_blocks["Max"] + df_blocks["Min"]) / 2.0

    for i, name in enumerate(genomes):
        ax = axes[i]
        df_tmp = df_blocks[df_blocks["Genome"] == name]

        ax.plot(df_tmp["Average"],
                df_tmp["Sensitivity"],
                label="Sensitivity")
        ax.plot(df_tmp["Average"], df_tmp["Coverage"], label="Coverage")
        ax.set_title(r"\textit{{{}. {}}}".format(name[0],
                                                 name.split()[1]),
                     style="italic")
        ax.set_ylim([49, 101])

        # Label only the outer panels. The previous `i % 2` / `i >= 2` tests
        # were correct only for a grid with exactly two columns.
        if i % num_cols == 0:
            ax.set_ylabel("Percentage")
        if i >= num_cols:
            ax.set_xlabel("Average Kimura")

    plt.subplots_adjust(bottom=0.17)
    fig.suptitle("StartLink Performance for small blocks of Kimura")
    fig.legend(loc="lower center", labels=["Accuracy", "Coverage"], ncol=2)
    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
def contour_kimura_per_ancestor(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one Min-Kimura vs Max-Kimura density panel per ancestor.

    Panels are laid out on a two-row grid with shared axes. A frameless
    overlay axis carries the common x/y labels. The figure is saved through
    ``save_figure`` and then shown.

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with "Ancestor", "Min-Kimura" and "Max-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    clades = sorted(list(set(df["Ancestor"])))
    num_columns = math.ceil(len(clades) / 2)
    fig, axes = plt.subplots(2,
                             num_columns,
                             sharex=True,
                             sharey=True,
                             figsize=(6, 6))

    for clade, panel in zip(clades, axes.ravel()):
        rows_for_clade = df[df["Ancestor"] == clade]
        min_values = rows_for_clade["Min-Kimura"].values
        max_values = rows_for_clade["Max-Kimura"].values
        seaborn.kdeplot(min_values, max_values, ax=panel)
        panel.set_title(clade)

    # Invisible full-figure axis: it exists only to hold the shared labels,
    # so all of its ticks and tick labels are switched off.
    fig.add_subplot(111, frameon=False)
    hidden_ticks = dict(top=False,
                        bottom=False,
                        left=False,
                        right=False,
                        labelbottom=False,
                        labeltop=False,
                        labelleft=False,
                        labelright=False)
    plt.tick_params(which="both", **hidden_ticks)
    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()
    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))
    save_figure(figure_options)

    plt.show()
示例#12
0
    def visualize(mgm_mm, title="", **kwargs):
        # type: (MGMMotifModelV2, str, Dict[str, Any]) -> None
        """Render one row of panels per shift plus a summary row, then save.

        Each shift row holds a logo panel followed by four motif panels.
        The last row holds the MSA text (only when "msa_t" is given), the
        prior panel and the spacer panel. The figure is written to
        ``next_name(".")`` and then shown.

        :param mgm_mm: motif model; its ``_shift_prior`` keys set the number
            of shift rows
        :param title: text inserted into the "Gc range: ..." figure title
        :param kwargs: optional "msa_t" and "raw_motif_data"; when raw data
            is given it is indexed by shift and replaces the model's motif
        """
        msa_t = get_value(kwargs, "msa_t", None)
        raw_motif_data = get_value(kwargs, "raw_motif_data", None)

        num_shifts = len(mgm_mm._shift_prior.keys())
        summary_row = num_shifts

        plt.figure(figsize=(14, 4 * num_shifts))
        grid = (num_shifts + 1, 5)

        painter = MGMMotifModelVisualizerV2

        # for each shift: logo in column 0, motif panels in columns 1-4
        for shift in range(num_shifts):
            logo_axis = plt.subplot2grid(grid, (shift, 0))
            motif_axes = [
                plt.subplot2grid(grid, (shift, column))
                for column in range(1, 5)
            ]

            painter._viz_logo(mgm_mm, logo_axis, shift)

            if raw_motif_data is not None:
                painter._viz_motif_pwm_from_raw_data(raw_motif_data[shift],
                                                     motif_axes,
                                                     mgm_mm.motif_width())
            else:
                painter._viz_motif_pwm(mgm_mm, motif_axes, shift)

        # last row: MSA, shift prior, spacers
        msa_axis = plt.subplot2grid(grid, (summary_row, 0))
        prior_axis = plt.subplot2grid(grid, (summary_row, 1))
        spacer_axis = plt.subplot2grid(grid, (summary_row, 2))

        painter._viz_spacer(mgm_mm, spacer_axis)
        painter._viz_prior(mgm_mm, prior_axis)

        if msa_t is not None:
            painter._viz_msa(msa_t, msa_axis)

        plt.suptitle("Gc range: {}".format(title))

        plt.tight_layout()
        plt.subplots_adjust(top=0.9)
        plt.savefig(next_name("."))
        plt.show()
示例#13
0
    def visualize(mgm_mm, title="", **kwargs):
        # type: (MGMMotifModel, str, Dict[str, Any]) -> None
        """Render the motif model on a 4x2 grid of panels, then save and show.

        Rows 0-1 hold the four motif panels, row 2 the prior and spacer
        panels, row 3 the logo and MSA panels. The last two are only filled
        when "msa_t" is supplied. The figure is written to
        ``next_name(".")``.

        :param mgm_mm: motif model to draw
        :param title: text inserted into the "Gc range: ..." figure title
        :param kwargs: optional "msa_t" and "raw_motif_data"; when raw data
            is given it replaces the model's motif in the four motif panels
        """
        msa_t = get_value(kwargs, "msa_t", None)
        raw_motif_data = get_value(kwargs, "raw_motif_data", None)

        plt.figure(figsize=(10, 12))
        grid = (4, 2)

        # Panels are created in the same order as before: the four motif
        # panels first, then logo, prior, spacer and MSA.
        motif_axes = [
            plt.subplot2grid(grid, (row, column))
            for row in (0, 1)
            for column in (0, 1)
        ]
        logo_axis = plt.subplot2grid(grid, (3, 0))
        prior_axis = plt.subplot2grid(grid, (2, 0))
        spacer_axis = plt.subplot2grid(grid, (2, 1))
        msa_axis = plt.subplot2grid(grid, (3, 1))

        painter = MGMMotifModelVisualizer

        if raw_motif_data is not None:
            painter._viz_motif_pwm_from_raw_data(raw_motif_data, motif_axes,
                                                 mgm_mm.motif_width())
        else:
            painter._viz_motif_pwm(mgm_mm, motif_axes)

        painter._viz_spacer(mgm_mm, spacer_axis)
        painter._viz_prior(mgm_mm, prior_axis)

        if msa_t is not None:
            painter._viz_logo(mgm_mm, logo_axis)
            painter._viz_msa(msa_t, msa_axis)

        plt.suptitle("Gc range: {}".format(title))

        plt.tight_layout()
        plt.subplots_adjust(top=0.9)
        plt.savefig(next_name("."))
        plt.show()
示例#14
0
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Box-plot the per-genome summary for each of steps A, B and C.

    For every step, the rows whose "Predicted-at-step" equals that step are
    summarised with ``get_summary_per_gcfid`` and tagged with the step in an
    "SBSP Step" column. The stacked summaries are drawn per "Ancestor".

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with a "Predicted-at-step" column
    """
    pd_work = env['pd-work']

    def summarize_step(step):
        # Summary restricted to predictions made at exactly this step.
        rows_at_step = df[df["Predicted-at-step"] == step]
        summary = get_summary_per_gcfid(rows_at_step)
        summary["SBSP Step"] = step
        return summary

    per_step_summaries = [summarize_step(step) for step in ["A", "B", "C"]]
    df_per_gcfid_per_step = pd.concat(per_step_summaries, sort=False)

    box_options = FigureOptions(save_fig=next_name(pd_work),
                                xlabel="Clade",
                                ylabel="Err(NCBI,GMS2=SBSP)")
    sns.catplot(df_per_gcfid_per_step,
                "Ancestor",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="SBSP Step",
                kind="box",
                legend_loc="best",
                figure_options=box_options)
示例#15
0
def plot_for_5prime(df, bins=20):
    # type: (pd.DataFrame, int) -> None
    """Plot, per genome, the percentage histogram of "score" for each region.

    One panel per genome on a two-row grid with shared axes. Within a panel
    each region's scores are binned with ``np.histogram``, converted to
    percentages of that group, and drawn as a line at the right bin edges.
    A dashed vertical line marks score 0.5. The figure is saved under
    ``next_name(my_env["pd-work"])`` and then shown.

    :param df: frame with "Genome", "region" and "score" columns
    :param bins: forwarded to ``np.histogram``
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn

    genomes = sorted(set(df["Genome"]))
    regions = sorted(set(df["region"]))
    num_genome = len(genomes)

    fig, axes = plt.subplots(2,
                             math.ceil(num_genome / 2),
                             sharey="all",
                             sharex="all")
    axes = axes.ravel()
    for i, g in enumerate(genomes):
        ax = axes[i]

        for x in regions:
            df_group = df[(df["Genome"] == g) & (df["region"] == x)]

            hist_values, bin_edges = np.histogram(df_group["score"], bins=bins)
            hist_values = 100 * hist_values / sum(hist_values)

            # Plot each bin at its right edge.
            bin_edges = bin_edges[1:]
            # Only the first panel contributes legend labels, so the shared
            # figure legend lists every region exactly once.
            seaborn.lineplot(bin_edges,
                             hist_values,
                             markers=False,
                             ax=ax,
                             label=x if i == 0 else None,
                             legend=False)

        ax.set_title(r"\textit{{{}}}".format(str(g)))
        ax.set_xlabel(None)
        ax.set_ylim([0, None])
        # axvline's ymin/ymax are fractions of the axes height (0..1), not
        # data values; the defaults already span the full height.
        ax.axvline(0.5, color="grey", linestyle="dashed")

    plt.subplots_adjust(bottom=0.17)
    fig.legend(loc="lower center", ncol=len(regions))

    # Invisible full-figure axis that only carries the shared axis labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("Score", labelpad=25)
    plt.ylabel("Percentage per group", labelpad=30)

    plt.savefig(next_name(my_env["pd-work"]))
    plt.show()
示例#16
0
def _set_latex_font_sizes(small, medium, bigger):
    # type: (int, int, int) -> None
    """Switch matplotlib to serif/LaTeX text with the given font sizes."""
    matplotlib.rcParams.update({
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
        'font.size': small,  # controls default text sizes
        'axes.titlesize': small,  # fontsize of the axes title
        'axes.labelsize': medium,  # fontsize of the x and y labels
        'xtick.labelsize': small,  # fontsize of the tick labels
        'ytick.labelsize': small,  # fontsize of the tick labels
        'legend.fontsize': 12,  # legend fontsize
        'figure.titlesize': bigger,  # fontsize of the figure title
    })


def _cumulative_blast_hit_percentages(df, num_thresholds=40):
    # type: (pd.DataFrame, int) -> pd.DataFrame
    """Per genome, percentage of rows with "BLAST" below growing thresholds.

    Thresholds start at 0, jump to 5 and then grow by a factor of 1.2.
    Returns one row per (genome, threshold) with columns "Genome",
    "Ancestor", "x" (threshold) and "y" (percentage).
    """
    list_entries = list()
    for _, df_genome in df.groupby("Genome", as_index=False):
        first_index = df_genome.index[0]
        genome = df.at[first_index, "Genome"]
        ancestor = df.at[first_index, "Ancestor"]

        total_queries = len(df_genome)
        threshold = 0
        for _ in range(num_thresholds):
            num_below = len(df_genome[df_genome["BLAST"] < threshold])
            list_entries.append({
                "Genome": genome,
                "Ancestor": ancestor,
                "x": threshold,
                "y": 100 * num_below / total_queries
            })

            threshold = 5 if threshold == 0 else threshold * 1.2

    return pd.DataFrame(list_entries)


def viz_per_genome(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot BLAST hit counts per genome, grouped by ancestor.

    Produces, in this order: a categorical plot of the per-genome mean
    "BLAST" by "Ancestor"; two line plots of the cumulative percentage of
    queries versus hit-count threshold (axes swapped in the second); and a
    2x2 grid with one cumulative curve per ancestor for thresholds <= 40.

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with "Genome", "Ancestor" and "BLAST" columns
    """
    df_mean = df.groupby(["Genome", "Ancestor"], as_index=False).mean()

    sns.catplot(df_mean,
                "Ancestor",
                "BLAST",
                figure_options=FigureOptions(save_fig=next_name(
                    env["pd-work"]),
                                             xlabel="Clade",
                                             ylabel="Number of BLASTp Hits"),
                sns_kwargs={"palette": CM.get_map("ancestor")})

    df_tmp = _cumulative_blast_hit_percentages(df)

    _set_latex_font_sizes(small=16, medium=22, bigger=24)

    sns.lineplot(df_tmp,
                 "x",
                 "y",
                 hue="Ancestor",
                 figure_options=FigureOptions(
                     xlabel="Number of BLASTp hits",
                     ylabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs={
                     "ci": "sd",
                     "palette": CM.get_map("ancestor")
                 })

    # Same data with the axes swapped.
    sns.lineplot(df_tmp,
                 "y",
                 "x",
                 hue="Ancestor",
                 figure_options=FigureOptions(
                     ylabel="Number of BLASTp hits",
                     xlabel="Cumulative percentage of queries (per genome)",
                     save_fig=next_name(env["pd-work"]),
                 ),
                 legend_loc="best",
                 legend_title="",
                 legend_ncol=2,
                 sns_kwargs={
                     "ci": "sd",
                     "palette": CM.get_map("ancestor")
                 })

    _set_latex_font_sizes(small=14, medium=18, bigger=20)
    fig, axes = plt.subplots(2, 2, sharex="all", sharey="all")

    ancestors = sorted(set(df["Ancestor"]))

    # zip stops at four panels; any further ancestors are not drawn.
    for anc, ax in zip(ancestors, axes.ravel()):
        df_anc = df_tmp[df_tmp["Ancestor"] == anc]
        sns.lineplot(df_anc[df_anc["x"] <= 40],
                     "x",
                     "y",
                     hue="Ancestor",
                     legend=None,
                     ax=ax,
                     sns_kwargs={
                         "ci": "sd",
                         "palette": CM.get_map("ancestor")
                     })
        ax.set_title(anc)
        ax.set_xlabel("")
        ax.set_ylabel("")

    # Only the labels of this object are used below. Its save_fig name is
    # still requested so that output file numbering stays as before.
    figure_options = FigureOptions(
        xlabel="Number of BLASTp hits",
        ylabel="Cumulative percentage of\nqueries (per genome)",
        save_fig=next_name(env["pd-work"]),
    )

    # Invisible full-figure axis that only carries the shared axis labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(figure_options.xlabel, labelpad=30)
    plt.ylabel(figure_options.ylabel, labelpad=30)

    fig.savefig(next_name(env["pd-work"]), bbox_inches="tight")
    plt.show()
示例#17
0
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot cumulative per-genome statistics by step and write summary CSVs.

    Steps are accumulated as "A", "A+B", "A+B+C": for each, the rows whose
    "Predicted-at-step" is <= the step are summarised with
    ``get_summary_per_gcfid``. A 3x2 grid shows three statistics for SBSP
    (left column) and GMS2=SBSP (right column). Finally "sensitivity.csv"
    and "coverage.csv" are written into ``env["pd-work"]``.

    NOTE: "Total SBSP", "Total GMS2" and "Total GMS2=SBSP" columns are
    written into the caller's ``df`` (once before plotting, and again with
    different values before the CSV summary).

    :param env: mapping that provides the "pd-work" output location
    :param df: frame with "GCFID", "Predicted-at-step", "NCBI", "SBSP",
        "GMS2" and "GMS2=SBSP" columns
    """
    import matplotlib.pyplot as plt

    tools = ("SBSP", "GMS2", "GMS2=SBSP")

    # compute total number of predictions per tool, per genome
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index,
                   "Total " + tool] = df.loc[df_group.index, tool].sum()

    # loop over steps A, A+B, and A+B+C and collect stats
    list_df = list()  # type: List[pd.DataFrame]
    tag = None
    for step in ["A", "B", "C"]:
        tag = step if tag is None else tag + "+" + step
        df_summary_per_gcfid = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        df_summary_per_gcfid["SBSP Step"] = tag
        list_df.append(df_summary_per_gcfid)

    df_per_gcfid_per_step = pd.concat(list_df, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # One list per figure column; entries are (y column, y label, y limits)
    # from top row to bottom row. The raw string keeps the backslash that
    # LaTeX needs without relying on an invalid "\%" escape.
    panels_per_column = [
        [
            ("Sen(SBSP,NCBI)", r"Error rate (\%)", [0, 20]),
            ("Cov(SBSP,NCBI)", "Percentage\nof Genes", [0, None]),
            ("SBSP", "Number\nof Genes", [0, None]),
        ],
        [
            ("Sen(GMS2=SBSP,NCBI)", "Error", [0, None]),
            ("Cov(GMS2=SBSP,NCBI)", "Percentage of Genes", [0, None]),
            ("GMS2=SBSP", "Number of Genes", [0, None]),
        ],
    ]

    # The bottom-right panel is the only one drawn with a legend; its
    # handles feed the shared figure legend further down.
    legend_position = (2, 1)
    legend_ax = axes[legend_position[0]][legend_position[1]]

    for column, panels in enumerate(panels_per_column):
        column_axes = axes[:, column]
        for row, (y_column, ylabel, ylim) in enumerate(panels):
            legend_kwargs = {}
            if (row, column) != legend_position:
                legend_kwargs["legend"] = False

            sns.lineplot(df_per_gcfid_per_step,
                         "SBSP Step",
                         y_column,
                         hue="GCFID",
                         ax=column_axes[row],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=ylim),
                         **legend_kwargs)

        if column == legend_position[1]:
            # Remove the per-axes legend; a figure-level one replaces it.
            legend_ax.get_legend().remove()

        fig.align_ylabels(column_axes)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title(TOOL)
    axes[0][1].set_title(TOOLp)

    fig.subplots_adjust(bottom=0.21)

    # Taken explicitly from the legend panel rather than from whatever axis
    # a previous loop variable happened to point at.
    handles, labels = legend_ax.get_legend_handles_labels()
    # Overwrites the first legend entry's text (presumably the hue title
    # "GCFID" -- confirm against the seaborn version in use).
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()

    # Redefine the totals as predictions that also have an NCBI entry.
    for _, df_group in df.groupby("GCFID", as_index=False):
        for tool in tools:
            df.loc[df_group.index,
                   "Total " + tool] = ((df_group[tool]) &
                                       (df_group["NCBI"])).sum()

    df_all = get_summary_per_gcfid(df)

    # map column names for tables
    columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    table_columns = ["Genome", "Verified", "SBSP", "GMS2", "GMS2=SBSP"]

    for metric, filename in (("Sen", "sensitivity.csv"),
                             ("Cov2", "coverage.csv")):
        renames = {"GCFID": "Genome", "NCBI": "Verified"}
        for tool in tools:
            renames["{}({},NCBI)".format(metric, tool)] = tool

        df_table = df_all[columns].rename(columns=renames, inplace=False)
        df_table[table_columns].to_csv(os_join(env["pd-work"], filename),
                                       index=False)
def _add_query_percentages(df, x_col):
    # type: (pd.DataFrame, str) -> None
    """Add per-ancestor query percentages to ``df`` in place.

    Two columns are written:

    * "Percentage-of-queries": each row's "Number-of-queries" expressed as a
      percentage of the total "Number-of-queries" of its "Ancestor" group.
    * "Cumulative-percentage-of-queries": running sum of those percentages
      inside the group, accumulated in increasing ``x_col`` order.

    The index of ``df`` is reset (the previous index becomes a regular
    column), so that group indices address rows of ``df`` unambiguously.

    :param df: frame holding "Ancestor", "Number-of-queries" and ``x_col``
    :param x_col: name of the column that defines the accumulation order
    """
    # Initialise as floats: the columns receive fractional percentages below,
    # and writing floats into an integer column is a deprecated silent upcast
    # in recent pandas releases.
    df["Percentage-of-queries"] = 0.0
    df["Cumulative-percentage-of-queries"] = 0.0
    df.reset_index(inplace=True)

    for _, df_group in df.groupby("Ancestor", as_index=False):
        # Sort a copy rather than mutating the group slice in place.
        ordered = df_group.sort_values(x_col).index
        total = float(df_group["Number-of-queries"].sum())

        percentages = 100 * df.loc[ordered, "Number-of-queries"] / total
        df.loc[ordered, "Percentage-of-queries"] = percentages
        # skipna=False: once a NaN appears, every later cumulative value is
        # NaN too, exactly like a manual running sum.
        df.loc[ordered, "Cumulative-percentage-of-queries"] = \
            percentages.cumsum(skipna=False)


def one_dim_Kimura_accuracy(env, df_all, num_steps=20):
    # type: (Environment, pd.DataFrame, int) -> None
    """Plot accuracy and query counts against binned Kimura statistics.

    The same four line plots (one line per "Ancestor") are produced twice:
    first with the data binned on "Average-Kimura", then with the rows whose
    "Support" exceeds 2 binned on "Std-Kimura".  For each binning the plotted
    quantities are "Accuracy", "Number-of-queries", "Percentage-of-queries"
    and "Cumulative-percentage-of-queries".  Every figure is saved under the
    next free name in ``env["pd-work"]``.

    :param env: environment; only ``env["pd-work"]`` is read here
    :param df_all: per-query data, forwarded to ``bin_data_one_d``
    :param num_steps: number of bins, forwarded to ``bin_data_one_d``
    """
    import matplotlib.pyplot as plt
    pd_work = env["pd-work"]

    def plot(frame, x_col, y_col, figure_options=None, **kwargs):
        """Draw one per-ancestor line plot and save it to the work directory."""
        if figure_options is None:
            figure_options = FigureOptions(save_fig=next_name(pd_work), )
        sns.lineplot(frame,
                     x_col,
                     y_col,
                     hue="Ancestor",
                     figure_options=figure_options,
                     sns_kwargs={"palette": CM.get_map("ancestor")},
                     **kwargs)

    # average Kimura distance
    df = bin_data_one_d(env, df_all, "Average-Kimura", num_steps)
    plot(df, "Average-Kimura", "Accuracy")
    plot(df, "Average-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Average-Kimura")

    _, ax = plt.subplots(figsize=(8, 4))
    plot(df,
         "Average-Kimura",
         "Percentage-of-queries",
         figure_options=FigureOptions(save_fig=next_name(pd_work),
                                      ylabel="Percentage of queries",
                                      xlabel="Average Kimura"),
         ax=ax,
         show=True,
         legend_loc="best")
    plot(df, "Average-Kimura", "Cumulative-percentage-of-queries")

    # standard deviation of the Kimura distance (rows with Support > 2 only)
    df = bin_data_one_d(env, df_all[df_all["Support"] > 2], "Std-Kimura",
                        num_steps)
    plot(df, "Std-Kimura", "Accuracy")
    plot(df, "Std-Kimura", "Number-of-queries")

    _add_query_percentages(df, "Std-Kimura")

    plot(df, "Std-Kimura", "Percentage-of-queries")
    plot(df, "Std-Kimura", "Cumulative-percentage-of-queries")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis for a genome list and plot a summary.

    The analysis runs either directly or through PBS, depending on the
    "use-pbs" parallelization option.  The resulting table is written to
    "summary.csv" in ``env["pd-work"]`` and a fixed series of figures is saved
    in the "summary_figures" sub-directory.

    :param env: environment; ``env["pd-work"]`` is the output directory
    :param args: parsed command-line arguments; ``args.pf_genome_list`` is the
        genome list file and all arguments feed the parallelization options
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        # Split the genome list across PBS jobs and stitch the results together.
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        per_job_frames = pbs.run(data={"gil": gil},
                                 func=relative_entropy_analysis,
                                 func_kwargs={
                                     "env": env,
                                     "prl_options": prl_options
                                 })
        df = pd.concat(per_job_frames, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, gil, prl_options)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    def error_options():
        """Figure options shared by every plot that has "Error" on the y axis."""
        return FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures))

    # The order of the calls below fixes the numbering of the saved figures.
    sns.scatterplot(df, "Percent", "Error", figure_options=error_options())

    for x_col in ("RE", "RE Motif", "RE Spacer"):
        sns.lineplot(df,
                     x_col,
                     "Error",
                     hue="Genome",
                     figure_options=error_options())

    sns.scatterplot(
        df,
        "RE Motif",
        "RE Spacer",
        hue="Genome",
        identity=True,
        figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    for x_col in ("Percent", "RE", "RE Motif", "RE Spacer"):
        sns.lmplot(df,
                   x_col,
                   "Error",
                   hue="Genome",
                   figure_options=error_options())

    sns.lmplot(df,
               "Percent",
               "RE",
               hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Plot a multi-panel summary of the motifs stored in column ``col``.

    The figure (saved under the next free name in ``env["pd-work"]`` and then
    shown) is a 4x2 grid containing:

    * one panel for each of the first four letters: box plot of the letter's
      probability per position plus a line through the per-position means;
    * a sequence logo built from the alignment collected while building the
      motif array;
    * a bar plot with the percentage of motifs per shift;
    * the averaged, normalised position distribution for every shift (read
      from the column named like ``col`` with "_MAT" replaced by
      "_POS_DISTR");
    * a text panel with the reduced alignment.

    :param env: environment; only ``env["pd-work"]`` is read here
    :param df: frame whose ``col`` entries map letter -> list of probabilities
    :param col: name of the motif column
    :param title: text inserted into the figure's super-title
    :raises ValueError: if a probability outside [0, 1] is found
    """
    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    # Only four letter panels exist; zip() below silently ignores extra letters.
    letter_axes = [ax1, ax2, ax3, ax4]

    ylim = [-0.1, 1.1]
    for letter, ax in zip(letters, letter_axes):
        letter_idx = letter_to_idx[letter]

        # Collect (position, probability) pairs over all motifs.
        positions = list()
        probabilities = list()
        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                # Keep only positions inside the 6-wide window that starts at
                # this motif's shift (6 is presumably the un-extended motif
                # width -- TODO confirm).
                if w_pos < shift or w_pos >= shift + 6:
                    continue

                probability = array[index, w_pos, letter_idx]
                if probability < 0 or probability > 1:
                    raise ValueError("Something's up")

                positions.append(w_pos)
                probabilities.append(probability)

        ax.set_title(f"{letter}")

        df_letter = pd.DataFrame({
            "Position": positions,
            "Probability": probabilities
        })
        df_letter.sort_values("Position", inplace=True)
        df_mean = df_letter.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: recent seaborn releases accept only
        # `data` positionally.
        seaborn.boxplot(x="Position",
                        y="Probability",
                        data=df_letter,
                        ax=ax,
                        color="red",
                        fliersize=0)
        seaborn.lineplot(x=df_mean["Position"],
                         y=df_mean["Probability"],
                         ax=ax,
                         color="blue")
        ax.set_ylim(ylim)

    # add logo
    ax = ax_logo
    msa_t = collect["msa_t"]
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # add bar plot of the shift percentages
    ax = ax_counts
    counter = Counter(update_shifts)
    num_shifts = sum(counter.values())
    # Shifts 0..3 that never occur still get a zero-height bar.
    missing_shifts = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / num_shifts]
                  for x in counter] + [[x, 0] for x in missing_shifts]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0],
                    y=normalized[:, 1],
                    ax=ax,
                    color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    ### Plot position distribution
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist
    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # position -> list of values, one per distribution with that shift
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            try:
                keys = pos_dist.keys()
            except AttributeError:
                # Entry is not a mapping (presumably a missing value); skip it.
                continue
            for i in keys:
                values.setdefault(i, list()).append(pos_dist[i])

        # average per position, then normalise so the averages sum to one
        for i in values.keys():
            values[i] = np.mean(values[i])

        sum_of_means = sum(values.values())
        for i in values.keys():
            values[i] /= sum_of_means

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # TEXT
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")
    # Render the reduced alignment once; reuse it for the console and the panel.
    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax.text(0,
            0,
            msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)
    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    # Leave room for the super-title, which tight_layout does not account for.
    plt.subplots_adjust(top=0.9)

    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def heat_map_Kimura_accuracy(env, df_all, x, y, num_steps=20, balance=False):
    # type: (Environment, pd.DataFrame, str, str, int, bool) -> None
    """Draw one accuracy heat map per ancestor over a 2-D binning of x and y.

    For every "Ancestor" group the ``x``/``y`` plane is split into
    ``num_steps`` x ``num_steps`` equal-width bins.  Per bin, accuracy is the
    number of rows with "GMS2=SBSP=NCBI" set divided by the number of rows
    with both "GMS2=SBSP" and "NCBI" set.  Bins with fewer than 10 rows in the
    denominator are left blank.  All maps share one colour bar (range 0..1);
    the figure is saved under the next free name in ``env["pd-work"]`` and
    then shown.

    :param env: environment; only ``env["pd-work"]`` is read here
    :param df_all: frame with "Ancestor", "GMS2=SBSP", "NCBI",
        "GMS2=SBSP=NCBI" and the ``x`` and ``y`` columns
    :param x: column binned along the horizontal axis
    :param y: column binned along the vertical axis
    :param num_steps: number of bins per axis
    :param balance: if True, both axes use the same (combined) value range
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn

    num_ancestors = len(set(df_all["Ancestor"]))
    fig, axes = plt.subplots(2,
                             math.ceil(num_ancestors / 2),
                             sharex=True,
                             sharey=True)
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    # About five ticks per axis; the step must stay >= 1 so that range() does
    # not fail when num_steps < 5.
    tick_step = max(1, num_steps // 5)
    xticks = list(range(0, num_steps, tick_step))
    yticks = list(range(0, num_steps, tick_step))
    last_bin = num_steps - 1

    mappable = None
    for ax, (ancestor, df) in zip(axes.ravel(),
                                  df_all.groupby("Ancestor", as_index=False)):
        # The small epsilon keeps the maximum value inside the last bin.
        min_x = min(df[x])
        max_x = max(df[x]) + 0.000000001

        min_y = min(df[y])
        max_y = max(df[y]) + 0.000000001

        if balance:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)

        ss_x = (max_x - min_x) / float(num_steps)
        ss_y = (max_y - min_y) / float(num_steps)

        gms2_eq_sbsp_and_ncbi = np.zeros([num_steps, num_steps], dtype=float)
        gms2_eq_sbsp_eq_ncbi = np.zeros([num_steps, num_steps], dtype=float)

        for x_val, y_val, agree, in_ncbi, all_equal in zip(
                df[x], df[y], df["GMS2=SBSP"], df["NCBI"],
                df["GMS2=SBSP=NCBI"]):
            # Clamp in case rounding pushes the maximum just past the last bin.
            x_pos = min(int((x_val - min_x) / ss_x), last_bin)
            y_pos = min(int((y_val - min_y) / ss_y), last_bin)

            if agree and in_ncbi:
                gms2_eq_sbsp_and_ncbi[x_pos][y_pos] += 1
            if all_equal:
                gms2_eq_sbsp_eq_ncbi[x_pos][y_pos] += 1

        # Discard bins with too little data.  Their accuracy is set to NaN
        # (drawn blank) instead of dividing by zero, which would yield inf and
        # be painted as if the accuracy were maximal.
        gms2_eq_sbsp_and_ncbi[gms2_eq_sbsp_and_ncbi < 10] = 0
        accuracy = np.full_like(gms2_eq_sbsp_and_ncbi, np.nan)
        np.divide(gms2_eq_sbsp_eq_ncbi,
                  gms2_eq_sbsp_and_ncbi,
                  out=accuracy,
                  where=gms2_eq_sbsp_and_ncbi > 0)

        l_x = np.arange(min_x, max_x, ss_x)
        l_y = np.arange(min_y, max_y, ss_y)
        xticklabels = [round(l_x[i], 2) for i in xticks]
        yticklabels = [round(l_y[i], 2) for i in yticks]

        # The first array axis is x, so transpose to put x on the horizontal
        # axis of the heat map.
        g = seaborn.heatmap(accuracy.transpose(),
                            vmin=0,
                            vmax=1,
                            xticklabels=xticklabels,
                            yticklabels=yticklabels,
                            ax=ax,
                            cbar=False)

        g.invert_yaxis()
        g.set_xticks(xticks)
        g.set_yticks(yticks)
        g.set_xticklabels(xticklabels, rotation=0)

        g.set_title(ancestor)
        # Any of the maps can feed the shared colour bar (same vmin/vmax).
        mappable = ax.collections[0]

    # Invisible axes spanning the whole figure, used only for shared labels.
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel(x, labelpad=20)
    plt.ylabel(y, labelpad=30)

    plt.colorbar(mappable, cax=cbar_ax)
    # Keep the right-hand 10% of the figure free for the colour bar.
    fig.tight_layout(rect=[0, 0, .9, 1])

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))

    plt.show()
def analyze_independent_predictions(max_candidates, sen_a, sen_b):
    # type: (int, float, float) -> None
    """Plot how agreement of two predictors relates to correctness.

    Data are computed by ``compute_data`` for three conditions ("Random",
    "Independent", "Fully dependent") and then plotted:

    1. sensitivities versus number of candidates (delegated to
       ``plot_sensitivities_vs_num_candidates`` with ``sen_a``/``sen_b``);
    2. "Probability" and "1 - Probability" versus number of candidates for
       rows where both sensitivities equal 0.9;
    3. a two-panel "Probability" figure (versus number of candidates at
       sensitivity 0.9, and versus sensitivity at 25 candidates);
    4. "Probability" versus number of candidates for the "Independent"
       condition, one line per sensitivity in 0.1 .. 0.9;
    5. the same two-panel figure as (3) for "Agree given prediction".

    All figures are saved under the next free name in the current directory.
    Adds a "1 - Probability" column to the computed data frame.

    :param max_candidates: forwarded to ``compute_data`` and to the
        sensitivity plot
    :param sen_a: sensitivity of the first predictor (sensitivity plot only)
    :param sen_b: sensitivity of the second predictor (sensitivity plot only)
    """
    import matplotlib.pyplot as plt

    sensitivities = {
        "Random": sensitivity_random,
        "Independent": sensitivity_independent,
        "Fully dependent": sensitivity_fully_dependent
    }

    agree_given_pred = {
        "Random": agree_given_pred_random,
        "Independent": agree_given_pred_independent,
        "Fully dependent": agree_given_pred_fully_dependent
    }

    df = compute_data(sensitivities, agree_given_pred, max_candidates)

    plot_sensitivities_vs_num_candidates(sensitivities, max_candidates, sen_a,
                                         sen_b)

    # Row selections reused by several figures.  Only columns are added to
    # `df` below, so the masks stay valid for the whole function.
    # NOTE(review): exact float comparison with 0.9 relies on compute_data
    # storing the literal 0.9 -- confirm.
    both_sens_09 = (df["Sensitivity A"] == 0.9) & (df["Sensitivity B"] == 0.9)
    equal_sens = df["Sensitivity A"] == df["Sensitivity B"]
    equal_sens_25_candidates = equal_sens & (df["Number of candidates"] == 25)

    def condition_palette():
        return CM.get_map("independence-conditions")

    def plot_two_panels(y_col, right_title):
        """Left: y vs. candidates at sensitivity 0.9; right: y vs. sensitivity."""
        fig, axes = plt.subplots(1, 2, sharey="all", figsize=(10, 4))

        sns.lineplot(df[both_sens_09],
                     "Number of candidates",
                     y_col,
                     hue="Condition",
                     sns_kwargs={"palette": condition_palette()},
                     ax=axes[0],
                     legend=False,
                     figure_options=FigureOptions(title="Sensitivity = 0.9", ))

        sns.lineplot(df[equal_sens_25_candidates],
                     "Sensitivity A",
                     y_col,
                     hue="Condition",
                     ax=axes[1],
                     sns_kwargs={"palette": condition_palette()},
                     figure_options=FigureOptions(
                         ylim=[0, 1.05],
                         xlim=[0, 1],
                         xlabel="Sensitivity",
                         title=right_title,
                     ))

        save_figure(FigureOptions(save_fig=next_name(".")), fig)
        plt.show()

    sns.lineplot(
        df[both_sens_09],
        "Number of candidates",
        "Probability",
        hue="Condition",
        sns_kwargs={"palette": condition_palette()},
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y=s|x_1=y, x_2=y)$",
        ))

    # error
    df["1 - Probability"] = 1 - df["Probability"]
    sns.lineplot(
        df[both_sens_09],
        "Number of candidates",
        "1 - Probability",
        hue="Condition",
        sns_kwargs={"palette": condition_palette()},
        legend_loc="best",
        figure_options=FigureOptions(
            save_fig=next_name("."),
            ylabel=r"$P(y\neq s|x_1=y, x_2=y)$",
        ))

    plot_two_panels("Probability", "Number of candidates = 25")

    # Rename on the filtered result instead of in place: an in-place rename
    # would mutate a slice of `df`.
    df_tmp = df[equal_sens
                & (df["Condition"] == "Independent") &
                (df["Sensitivity A"].isin(
                    {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}))].rename(
                        columns={"Sensitivity A": "Sensitivity"})

    sns.lineplot(
        df_tmp,
        "Number of candidates",
        "Probability",
        hue="Sensitivity",
        figure_options=FigureOptions(title="Independent algorithms",
                                     save_fig=next_name(".")),
    )

    plot_two_panels("Agree given prediction", "Number of targets = 25")
def viz_summary_per_gcfid(env, df, title=None):
    # type: (Environment, pd.DataFrame, Optional[str]) -> None
    """Save the per-genome summary figures, coloured by "Ancestor".

    Produces, in this order: two box plots per ancestor, a scatter plot and
    two lowess plots of the NCBI disagreement percentage against genome GC,
    three lowess plots of the GMS2=SBSP columns against genome GC, two scatter
    plots of the NCBI disagreement percentage against the "NCBI" and
    "GMS2=SBSP" columns, and a scatter plot of the Prodigal disagreement
    percentage against genome GC.  Each figure is saved under the next free
    name in ``env['pd-work']``.

    :param env: environment; only ``env['pd-work']`` is read here
    :param df: per-genome summary frame
    :param title: optional title applied to every figure
    """
    pd_work = env['pd-work']

    gc_col = "Genome GC"
    ncbi_error_col = "(GMS2=SBSP)!=NCBI % GMS2=SBSP"
    ncbi_error_label = "1 - Sen(NCBI, GMS2=SBSP)"

    def options(**extra):
        """Figure options with the next output name and the common title."""
        return FigureOptions(save_fig=next_name(pd_work), title=title, **extra)

    def palette_only():
        return {"palette": CM.get_map("ancestor")}

    def lowess_scatter():
        """Seaborn options for a lowess fit drawn over small scatter points."""
        return {
            "palette": CM.get_map("ancestor"),
            "scatter": True,
            "lowess": True,
            "scatter_kws": {
                "s": 5
            }
        }

    # box plots per ancestor
    figure_options = options(ylim=[None, 100])
    sns.catplot(df,
                "Ancestor",
                "GMS2=SBSP % SBSP",
                kind="box",
                figure_options=figure_options,
                sns_kwargs=palette_only())

    figure_options = options(ylim=[0, 20],
                             ylabel=ncbi_error_label,
                             xlabel="Clade")
    sns.catplot(df,
                "Ancestor",
                ncbi_error_col,
                kind="box",
                figure_options=figure_options,
                sns_kwargs=palette_only())

    # NCBI disagreement per GC: plain scatter
    figure_options = options(ylim=[0, None])
    sns.scatterplot(df,
                    gc_col,
                    ncbi_error_col,
                    hue="Ancestor",
                    figure_options=figure_options,
                    legend_loc="best",
                    sns_kwargs=palette_only())

    # NCBI disagreement per GC: lowess curve without the points
    figure_options = options(ylim=[0, None], ylabel=ncbi_error_label)
    curve_only = {
        "palette": CM.get_map("ancestor"),
        "scatter": False,
        "lowess": True
    }
    sns.lmplot(df,
               gc_col,
               ncbi_error_col,
               hue="Ancestor",
               figure_options=figure_options,
               sns_kwargs=curve_only)

    # NCBI disagreement per GC: lowess curve over the points, wider figure
    figure_options = options(ylim=[0, None], ylabel=ncbi_error_label)
    wide_lowess = lowess_scatter()
    wide_lowess["aspect"] = 1.5
    sns.lmplot(df,
               gc_col,
               ncbi_error_col,
               hue="Ancestor",
               figure_options=figure_options,
               legend_loc="best",
               sns_kwargs=wide_lowess)

    # GMS2=SBSP columns per GC
    for y_col, ylim in (("GMS2=SBSP", [0, None]),
                        ("GMS2=SBSP % SBSP", [50, 100]),
                        ("GMS2=SBSP % GMS2", [50, 100])):
        figure_options = options(ylim=ylim)
        sns.lmplot(df,
                   gc_col,
                   y_col,
                   hue="Ancestor",
                   figure_options=figure_options,
                   sns_kwargs=lowess_scatter())

    # NCBI disagreement against the "NCBI" and "GMS2=SBSP" columns
    for x_col in ("NCBI", "GMS2=SBSP"):
        figure_options = options(ylim=[0, None])
        sns.scatterplot(df,
                        x_col,
                        ncbi_error_col,
                        hue="Ancestor",
                        figure_options=figure_options,
                        sns_kwargs=palette_only())

    # Prodigal disagreement per GC
    figure_options = options(ylim=[0, None])
    sns.scatterplot(df,
                    gc_col,
                    "(GMS2=SBSP)!=Prodigal % GMS2=SBSP",
                    hue="Ancestor",
                    figure_options=figure_options,
                    sns_kwargs=palette_only())
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome summaries for cumulative steps, then print tables.

    The function works in three stages:

    1. For every "GCFID" group, the columns "Total SBSP", "Total GMS2" and
       "Total GMS2=SBSP" are written into ``df`` in place, holding the
       per-group sum of the matching tool column.
    2. ``get_summary_per_gcfid`` is evaluated on the rows whose
       "Predicted-at-step" is <= "A", "B" and "C" in turn; the results are
       tagged "A", "A+B", "A+B+C" and drawn on a 3x2 grid (left column
       SBSP, right column GMS2=SBSP).  The figure is saved under
       ``env["pd-work"]`` and shown.
    3. The "Total ..." columns are overwritten with the per-group sum of the
       element-wise ``&`` of each tool column with "NCBI", the summary is
       recomputed on the full frame and two tables are printed.

    :param env: environment mapping; only "pd-work" is read here
    :param df: per-gene data frame; modified in place (see above)

    NOTE(review): the function finishes with ``sys.exit()``, so it never
    returns to its caller - presumably a debugging leftover; confirm before
    removing.
    """
    import sys
    import matplotlib.pyplot as plt

    pd_work = env['pd-work']

    # (destination column, source column) pairs, in assignment order
    total_columns = (
        ("Total SBSP", "SBSP"),
        ("Total GMS2", "GMS2"),
        ("Total GMS2=SBSP", "GMS2=SBSP"),
    )

    # stage 1: per-genome totals, broadcast onto every row of the genome
    for _, rows in df.groupby("GCFID", as_index=False):
        idx = rows.index
        for total_col, tool_col in total_columns:
            df.loc[idx, total_col] = df.loc[idx, tool_col].sum()

    # stage 2: one summary per cumulative set of steps
    steps_so_far = []
    summaries = []
    for step in ["A", "B", "C"]:
        steps_so_far.append(step)
        step_summary = get_summary_per_gcfid(
            df[df["Predicted-at-step"] <= step])
        step_summary["SBSP Step"] = "+".join(steps_so_far)
        summaries.append(step_summary)

    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    fig, axes = plt.subplots(3, 2, sharex="all", sharey="row")

    # y-label and y-limits shared by both panels of a grid row
    row_styles = (
        ("Sensitivity", [85, 105]),
        ("Percent of Genes", [0, None]),
        ("Number of Genes", [0, None]),
    )
    # y columns plotted top-to-bottom, one tuple per grid column
    panel_columns = (
        ("Sen(SBSP,NCBI)", "Cov(SBSP,NCBI)", "SBSP"),
        ("Sen(GMS2=SBSP,NCBI)", "Cov(GMS2=SBSP,NCBI)", "GMS2=SBSP"),
    )

    for col_idx, y_columns in enumerate(panel_columns):
        column_axes = axes[:, col_idx]

        for row_idx, y_column in enumerate(y_columns):
            ylabel, ylim = row_styles[row_idx]

            # Every panel suppresses its legend except the bottom-right one,
            # whose legend is drawn and then removed right after the column.
            is_bottom_right = (col_idx == 1 and row_idx == 2)
            legend_kwargs = {} if is_bottom_right else {"legend": False}

            sns.lineplot(df_per_gcfid_per_step,
                         "SBSP Step",
                         y_column,
                         hue="GCFID",
                         ax=column_axes[row_idx],
                         sns_kwargs={"palette": CM.get_map("verified")},
                         figure_options=FigureOptions(ylabel=ylabel,
                                                      ylim=list(ylim)),
                         **legend_kwargs)

        if col_idx == 1:
            column_axes[2].get_legend().remove()

        fig.align_ylabels(column_axes)

    for panel in axes.ravel():
        panel.set_xlabel("Steps")

    axes[0][0].set_title("SBSP")
    axes[0][1].set_title("GMS2=SBSP")

    # leave room below the grid for the figure-level legend
    fig.subplots_adjust(bottom=0.21)

    # legend entries are taken from the bottom-right panel
    handles, labels = axes[2][1].get_legend_handles_labels()
    labels[0] = "Genome"
    fig.legend(handles=handles, labels=labels, loc="lower center", ncol=3)
    plt.savefig(next_name(pd_work))
    plt.show()

    # stage 3: totals restricted by the "NCBI" column, then printed tables
    for _, rows in df.groupby("GCFID", as_index=False):
        ncbi_flags = rows["NCBI"]
        for total_col, tool_col in total_columns:
            df.loc[rows.index, total_col] = (rows[tool_col] &
                                             ncbi_flags).sum()

    df_all = get_summary_per_gcfid(df)

    sensitivity_columns = [
        "GCFID", "NCBI", "Sen(SBSP,NCBI)", "Sen(GMS2,NCBI)",
        "Sen(GMS2=SBSP,NCBI)"
    ]
    print(df_all[sensitivity_columns].to_string(index=False))

    coverage_columns = [
        "GCFID", "NCBI", "Cov2(SBSP,NCBI)", "Cov2(GMS2,NCBI)",
        "Cov2(GMS2=SBSP,NCBI)"
    ]
    print(df_all[coverage_columns].to_string(index=False))

    # NOTE(review): terminates the whole interpreter, not just this function.
    sys.exit()
示例#25
0
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one GC-vs-``M:<tag>`` panel per 5' comparison tag.

    The tags come from ``get_tags_for_5prime(df)``.  A grid with two rows
    and ``ceil(num_tags / 2)`` columns (shared axes) is created and
    ``loess_with_stde`` is called once per tag on its own axes, using the
    "GC" column as x and the ``M:<tag>`` column as y.  Only the first panel
    is handed the dedicated colour-bar axes.  A frameless overlay subplot
    carries the shared x/y labels.  The figure is saved through
    ``save_figure`` to ``next_name(env["pd-work"])`` and shown.

    Every tag is plotted: previously the loop was additionally zipped with
    a fixed four-element colour list that was never used, which silently
    dropped any tag beyond the fourth.

    :param env: environment mapping; only "pd-work" is read here
    :param df: data frame with a "GC" column and one ``M:<tag>`` column per
        tag
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)

    fig, ax = plt.subplots(2,
                           math.ceil(num_tags / 2),
                           sharey="all",
                           sharex="all")

    # Extra axes at the right edge, reserved for the colour bar.
    fig.add_axes([.91, .3, .03, .4])
    cbar_ax = fig.axes[-1]

    # Options forwarded verbatim to loess_with_stde for every panel.
    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    # Extra options passed for the first panel only.
    cbar_enable = {
        "cbar_ax": cbar_ax,
        "cbar": True,
    }

    for index, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        panel_kws = dict(kws)
        if index == 0:
            panel_kws.update(cbar_enable)

        loess_with_stde(df, "GC", f"M:{tag}", a, tag.replace("=", ","),
                        **panel_kws)

        a.set_title(
            tag.replace("=", ",").replace("NCBI", "PGAP").replace(
                "GMS2", "GeneMarkS-2"))
        # Per-panel labels are blanked; the shared labels are set below.
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Invisible full-figure axes used only to host the common axis labels.
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    # Keep the right-hand strip free for the colour-bar axes.
    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])

    save_figure(figure_options, fig)
    plt.show()
def visualize_matrix_column(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Project the matrices of column ``col`` with UMAP and plot the result.

    Rows where ``col`` is NA are dropped.  ``create_numpy_for_column`` is
    expected to return a 3-D array, which is flattened to one row per data
    frame row before fitting ``umap.UMAP`` with ``random_state=0``.  Three
    figures are produced from the same fitted reducer, each titled ``col``,
    saved to ``next_name(env["pd-work"])`` and shown:

    * points coloured by the "GC" column (viridis, with a colour bar),
    * points labelled by "GENOME_TYPE" ("Paired" colour key),
    * points labelled by "Type".

    :param env: environment mapping; only "pd-work" is read here
    :param df: data frame holding ``col``, "GC", "GENOME_TYPE" and "Type"
    :param col: name of the column whose matrices are embedded
    """

    def _title_save_show():
        # Shared tail of every figure: title, layout, save, display.
        plt.title(col)
        plt.tight_layout()
        save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
        plt.show()

    # only rows that actually have a value for the column are usable
    df = df[df[col].notna()]

    # NOTE(review): this font object is configured but not referenced again
    # in this function.
    fp = FontProperties()
    fp.set_family("monospace")

    # collapse the two trailing axes so each row becomes one flat vector
    mat = create_numpy_for_column(df, col)
    dims = mat.shape
    mat = mat.reshape((dims[0], dims[1] * dims[2]))

    # features used to colour / label the embedding
    gc = df["GC"]
    group = df["GENOME_TYPE"]

    seed = 0
    reducer = umap.UMAP(random_state=seed).fit(mat)
    print(reducer.embedding_.shape)

    # 1) continuous colouring by GC
    umap.plot.points(reducer, values=gc, cmap="viridis")
    plt.colorbar(create_mappable_for_colorbar(gc, "viridis"))
    _title_save_show()

    # 2) categorical colouring by genome type
    umap.plot.points(reducer, labels=group.values, color_key_cmap="Paired")
    _title_save_show()

    # 3) categorical colouring by the "Type" column
    umap.plot.points(reducer, labels=df["Type"])
    _title_save_show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot per-genome accuracy against chunk size for SBSP, GMS2 and MGM.

    Reads the CSV at ``args.pf_data``, divides "chunk-size" by 1000 and
    draws, on one shared axes, the column
    "percentage-common-3prime-and-5prime-from-common-3prime" per "Genome":

    * rows with Tool == "SBSP": restyled as dashed lines,
    * rows with Tool == "GMS2": default style, with the legend,
    * rows with Tool == "MGM" (only if present): restyled as dotted lines.

    If ``args.with_mgm`` is set, grey dashed guide lines and an "MGM"
    annotation are added.  The final figure is saved to
    ``next_name(env["pd-work"])`` and shown.

    Only the lines actually added by the MGM plot are made dotted.  The
    previous code unconditionally restyled the last five lines of the axes,
    which hit GMS2 or guide lines whenever no MGM rows were plotted, and
    assumed exactly five genomes otherwise.

    :param env: environment mapping; only "pd-work" is read here
    :param args: parsed arguments; ``pf_data`` and ``with_mgm`` are read
    """
    import matplotlib.pyplot as plt

    df = pd.read_csv(args.pf_data)
    df["chunk-size"] /= 1000

    y_column = "percentage-common-3prime-and-5prime-from-common-3prime"

    fig, ax = plt.subplots()

    # The FigureOptions carry a save_fig path, so a file name is consumed by
    # this first call as well as by the final save below.
    sns.lineplot(df[df["Tool"] == "SBSP"], "chunk-size", y_column,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified"),
                             "linestyle": "dashed"},
                 ax=ax,
                 legend=False,
                 figure_options=FigureOptions(
                     xlabel="Chunk size (mb)",
                     ylabel="Accuracy",
                     ylim=[74, 101],
                     save_fig=next_name(env["pd-work"])
                 ))

    # At this point the axes hold only the SBSP lines; force them dashed.
    for line in ax.lines:
        line.set_linestyle("--")

    sns.lineplot(df[df["Tool"] == "GMS2"], "chunk-size", y_column,
                 hue="Genome",
                 sns_kwargs={"palette": CM.get_map("verified")},
                 legend_loc="best",
                 legend_ncol=2,
                 ax=ax)

    if args.with_mgm:
        y_max = ax.get_ylim()[1]
        # NOTE(review): matplotlib documents the 2nd/3rd positional
        # arguments of axvline/axhline as axes fractions in [0, 1]; the
        # values below look like data coordinates, so the horizontal guide
        # probably falls outside the visible axes.  Left unchanged because
        # the intended geometry is not clear from this file.
        ax.axvline(50, 0, y_max, color="grey", linestyle="dashed")
        ax.axhline(74, 5, 49, color="grey", linestyle="dashed")
        ax.annotate("MGM", (5, 72))

    if "MGM" in set(df["Tool"]):
        # Remember where the MGM lines start so only they are restyled.
        first_mgm_line = len(ax.lines)
        sns.lineplot(df[df["Tool"] == "MGM"], "chunk-size", y_column,
                     hue="Genome",
                     sns_kwargs={"palette": CM.get_map("verified"),
                                 "linestyle": "-."},
                     ax=ax,
                     legend=False)

        for line in ax.lines[first_mgm_line:]:
            line.set_linestyle(":")

    fo = FigureOptions(
        xlabel="Chunk size (mb)",
        ylabel="Accuracy",
        ylim=[74, 101],
        save_fig=next_name(env["pd-work"])
    )
    FigureOptions.set_properties_for_axis(ax, fo)
    plt.savefig(fo.save_fig)
    plt.show()
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot statistics of the most frequent distance to the upstream gene.

    All figures saved by this function go to
    ``<env["pd-work"]>/upstream_distances`` (created if missing).

    Processing steps:

    1. Rows whose "Upstream-distance" is the string "[]" are dropped; the
       remaining strings are parsed with ``ast.literal_eval`` and reduced
       to "Most frequent upstream" via ``most_frequent``.
    2. "PC(x,0)" and "PC(x,3)" are computed per row with
       ``compute_consistency`` (flexibility 0 and 3).
    3. Only rows with "Support" > 10 and a most frequent distance strictly
       between -50 and 100 are kept; all plots except the first additionally
       require a truthy "GMS2=SBSP".
    4. A sequence of distribution, consistency and per-range plots is drawn,
       grouped by "Ancestor".

    Fixes relative to the earlier version:

    * the per-ancestor grid is created with ``squeeze=False`` so a single
      ancestor no longer crashes on ``axes.ravel()``;
    * ancestors are taken in order of first appearance instead of ``set``
      order, so the panel order is reproducible between runs;
    * ``seaborn.lineplot`` receives x/y as keywords, which works on both
      old and new seaborn releases.

    :param env: environment mapping; only "pd-work" is read here
    :param df: data frame with (at least) the columns "Upstream-distance",
        "Support", "Ancestor", "GMS2=SBSP", "NCBI" and "(GMS2=SBSP)!=NCBI";
        the caller's frame is not modified (a filtered copy is used)
    """
    import seaborn
    import matplotlib.pyplot as plt

    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    def _finish_clade_plot(ylabel):
        # Shared tail of the two catplots: legend, axis labels, save, show.
        plt.legend(loc="best", title="Clade")
        figure_options = FigureOptions(
            save_fig=next_name(pd_work),
            xlabel="Most frequent distance to upstream gene",
            ylabel=ylabel)
        plt.xlabel(figure_options.xlabel)
        plt.ylabel(figure_options.ylabel)
        save_figure(figure_options)
        plt.show()

    # remove empty lists
    df = df[df["Upstream-distance"] != "[]"].copy()
    # the column stores Python-literal strings; parse them into objects
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in (0, 3):
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # df_tmp keeps every tool; df is further restricted to GMS2=SBSP rows.
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    # one row per (original row, flexibility) so both PCs share one plot
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)",
                "Ancestor"]], ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    # distances strictly between -10 and 10, used by the two clade plots
    df_near = df[(df["Most frequent upstream"] < 10)
                 & (df["Most frequent upstream"] > -10)]

    # percentage of each distance within its clade
    (df_near.groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))
    _finish_clade_plot("Percent of components (by clade)")

    # absolute number of each distance within its clade
    (df_near.groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))
    _finish_clade_plot("Number of components")

    # histogram (left axis) and KDE (right axis, ticks hidden) per clade
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for _, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)

        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    for pc_column in ("PC(x,0)", "PC(x,3)"):
        sns.lmplot(df,
                   "Most frequent upstream",
                   pc_column,
                   hue="Ancestor",
                   sns_kwargs={
                       "scatter": False,
                       "lowess": True,
                       "palette": CM.get_map("ancestor")
                   },
                   figure_options=FigureOptions(save_fig=next_name(pd_work),
                                                xlim=[-7, None],
                                                ylim=[0, 1]))

    # half-open distance ranges [low, high) used for the binned analyses
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]

    # variant 1: summarise per GCFID first, then average per ancestor
    list_collect = list()
    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    for y_column in ("(GMS2=SBSP)!=NCBI % GMS2=SBSP", "GMS2=SBSP"):
        sns.catplot(df_tmp,
                    "Range",
                    y_column,
                    hue="Ancestor",
                    kind="point",
                    sns_kwargs={"palette": CM.get_map("ancestor")})

    # variant 2: do not average per gcfid - pool all genes per ancestor
    list_collect = list()
    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # percentage of GMS2=SBSP genes (with an NCBI entry) flagged as
            # differing from NCBI
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    for y_column in ("(GMS2=SBSP)!=NCBI % GMS2=SBSP", "GMS2=SBSP"):
        sns.catplot(df_tmp,
                    "Range",
                    y_column,
                    hue="Ancestor",
                    kind="point",
                    sns_kwargs={"palette": CM.get_map("ancestor")})

    # One panel per ancestor, in order of first appearance (deterministic).
    ancestors = list(dict.fromkeys(df_tmp["Ancestor"]))
    # squeeze=False keeps `axes` a 2-D array even for a single ancestor.
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
        squeeze=False,
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot(x="range_avg",
                         y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot(x="range_avg",
                         y="GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    # all ancestors on one axes: error rate (left) and gene count (right)
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot(x="range_avg",
                     y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot(x="range_avg",
                     y="GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()
示例#29
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Draw a grid of motif logos, one per GC value.

    Loads the object at ``args.pf_data``, keeps the rows whose
    "GENOME_TYPE" is in ``args.group`` (and, for motif type "PROMOTER",
    only rows with "GC" >= 40), and asks ``get_models_by_gc`` for one model
    per GC value in ``np.arange(20, 70, 2)``.  Each non-``None`` model is
    converted from a probability to an information matrix - element 0 of
    the model is passed as the matrix and element 1 as the background - and
    rendered with ``lm.Logo`` into its own cell, titled with the integer GC
    value.  Cells of ``None`` models are left empty.  The figure is saved
    to ``next_name(env["pd-work"])`` and shown.

    The grid is created with ``squeeze=False`` so that it is always a 2-D
    array; previously a grid with a single row (fewer than four models)
    failed when it was indexed as ``axes[r][c]``.

    :param env: environment mapping; only "pd-work" is read here
    :param args: parsed arguments; ``pf_data``, ``group`` and
        ``motif_type`` are read
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]
    min_gc = 20
    max_gc = 70

    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    gc_values = np.arange(min_gc, max_gc, 2)
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    # Near-square grid with at least as many cells as there are models.
    num_plots = len(models)
    num_rows = int(math.sqrt(num_plots))
    num_cols = math.ceil(num_plots / float(num_rows))

    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             sharex="all",
                             sharey="all",
                             figsize=(12, 10),
                             squeeze=False)

    # Model i goes into cell i (row-major); surplus cells stay empty.
    for model_index, (model, ax) in enumerate(zip(models, axes.ravel())):
        if model is None:
            continue

        newmod = lm.transform_matrix(model[0],
                                     to_type="information",
                                     from_type="probability",
                                     background=model[1])
        lm.Logo(newmod, ax=ax)

        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))
    plt.show()
示例#30
0
def _plot_motif_logo_pair(df_gms2, df_toolp, pd_figures):
    """Draw two motif logos side by side, save the figure, then release it.

    Both inputs are probability matrices (here: the results of
    ``MotifModel.pwm_to_df()``). Each one is converted with the *same*
    ``lm.transform_matrix(..., from_type="probability",
    to_type="information")`` call, so the two panels differ only in the model
    they display, not in the transformation applied.

    :param df_gms2: probability matrix shown in the left panel ("GeneMarkS-2")
    :param df_toolp: probability matrix shown in the right panel ("StartLink+")
    :param pd_figures: directory handed to ``next_name`` to pick the output path
    """
    fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

    panels = zip(axes, (df_gms2, df_toolp), ("GeneMarkS-2", "StartLink+"))
    for ax, prob_mat, title in panels:
        info_mat = lm.transform_matrix(prob_mat, from_type="probability", to_type="information")
        lm.Logo(info_mat, color_scheme="classic", ax=ax)
        ax.set_ylim(0, 2)
        ax.set_title(title)

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()

    # pyplot keeps every figure it creates alive until it is explicitly closed.
    # This helper runs once per genome, so without this the figures pile up.
    plt.close(fig)


def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and toolp RBS motif models for every genome in a list.

    For each genome read from ``args.pf_genome_list`` this:

    * fetches the two models via ``compare_gms2_and_toolp_motifs_for_gi``,
    * saves a side-by-side logo figure under ``<pd-work>/figures``,
    * computes ``relative_entropy`` of each RBS motif against the GMS2
      non-coding model and the genome GC percentage,
    * records one summary row (``args.verified`` false) or two rows, one per
      tool (``args.verified`` true).

    Afterwards the collected rows are written to CSV and plotted with
    ``sbsp_viz.sns``; which plots are produced depends on ``args.verified``.

    :param env: environment mapping; reads ``"pd-work"`` and ``"pd-data"``
    :param args: parsed arguments; reads ``pf_genome_list`` and ``verified``
    """

    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # Second "-"-separated token of GENOME_TYPE, upper-cased.
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        _plot_motif_logo_pair(mm_gms2.pwm_to_df(), mm_toolp.pwm_to_df(), pd_figures)

        # Both motifs are scored against the same (GMS2) non-coding model.
        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            # NOTE(review): this value is stored under "Accuracy" but is later
            # plotted as "Percentage of different 5' ends" and printed as
            # "Average Error" -- confirm what the comparison helper returns.
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: one row per tool so the plots can use hue="Tool"
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            genome_label = fix_names(gi.name)
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    df = pd.DataFrame(list_run_info)

    # NOTE: the order of next_name() calls below is kept exactly as before,
    # since it determines which output file gets which name.
    if args.verified:
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Relative entropy",
        ))

    else:

        def plot_gc_vs_differences(frame):
            # GC percentage against the per-genome "Accuracy" column.
            sns.scatterplot(frame, "GC", "Accuracy", figure_options=FigureOptions(
                save_fig=next_name(env["pd-work"]),
                xlabel="GC",
                ylabel="Percentage of different 5' ends",
                ylim=[0, 10],
            ))

        def plot_relative_entropies(frame):
            # Relative entropy of the two motifs against each other.
            sns.scatterplot(frame, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
                save_fig=next_name(env["pd-work"])
            ))

        # Pass 1: all genomes.
        plot_gc_vs_differences(df)
        df.to_csv(next_name(env["pd-work"], ext="csv"))
        plot_relative_entropies(df)
        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Pass 2: only genomes whose "Accuracy" value is below 2. The frame is
        # rebuilt from the raw records first, as in the original flow.
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        plot_gc_vs_differences(df)
        plot_relative_entropies(df)
        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(env["pd-work"], ext="csv"))