示例#1
0
def scatterplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a scatter plot of two columns of ``df``, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column placed on the x axis
    :param y: name of the column placed on the y axis
    :param hue: optional name of the column used to colour the points
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs`` (dict forwarded to ``sns.scatterplot``),
        ``ax`` (axes to draw on; a new figure is created when absent),
        ``identity`` (add a red dashed identity line),
        ``legend``, ``legend_loc`` and ``legend_title`` (legend control)
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    identity = get_value(kwargs, "identity", False)

    if ax is None:
        _, ax = plt.subplots()

    # Draw on the resolved axes explicitly; previously a caller-supplied `ax`
    # was ignored and the points landed on pyplot's current axes. An "ax"
    # entry already present in sns_kwargs still takes precedence so existing
    # callers do not hit a duplicate-keyword error.
    plot_kwargs = dict(sns_kwargs)
    plot_kwargs.setdefault("ax", ax)
    sns.scatterplot(x=x, y=y, hue=hue, data=df, linewidth=0, **plot_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # default placement: outside the plot, to the right
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # keep the requested title when an explicit location is given
            ax.legend(loc=legend_loc, title=title)

    save_figure(figure_options)
    plt.show()
def venn_diagram_5prime(labels_a, labels_b, labels_c, figure_options=None):
    # type: (Labels, Labels, Labels, FigureOptions) -> None
    """Show a three-set Venn diagram with equally sized regions.

    The three label sets are first passed through
    ``reduce_labels_to_genes_in_all``; the text written into each region is
    taken from ``numbers_for_3d_venn`` applied to the reduced sets.

    :param labels_a: first label set (its ``name`` is used as the set label)
    :param labels_b: second label set
    :param labels_c: third label set
    :param figure_options: axis properties; when its ``save_fig`` is set the
        figure is also written to that path
    """
    reduced_sets = reduce_labels_to_genes_in_all([labels_a, labels_b, labels_c])
    region_text = numbers_for_3d_venn(*reduced_sets)

    _, ax = plt.subplots()

    # Every region gets size 1 so the circles are drawn equal; the real
    # numbers are written into the region labels below.
    set_names = [labels.name for labels in reduced_sets]
    diagram = venn3([1, 1, 1, 1, 1, 1, 1], set_labels=set_names)

    for region_id, text in region_text.items():
        region_label = diagram.get_label_by_id(region_id)
        region_label.set_text(text)

    FigureOptions.set_properties_for_axis(ax, figure_options)

    wants_file = figure_options is not None and figure_options.save_fig is not None
    if wants_file:
        plt.savefig(figure_options.save_fig, bbox_inches='tight')

    plt.show()
示例#3
0
def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    """Draw a categorical plot with ``sns.catplot``, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column placed on the x axis
    :param y: name of the column placed on the y axis
    :param hue: optional name of the column used for colour grouping
    :param kind: plot kind forwarded to ``sns.catplot`` (e.g. "box", "point")
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs`` (dict forwarded to ``sns.catplot``),
        ``legend``, ``legend_loc`` and ``legend_title`` (legend control)
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    g = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False, aspect=1.5, **sns_kwargs)

    if kind == "point":
        # use thin lines for every line drawn on the grid's axes
        plt.setp(g.ax.lines, linewidth=1)

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)

    # seaborn's own legend is disabled above; it is drawn here instead
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # default placement: outside the plot, to the right
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # keep the requested title when an explicit location is given
            plt.legend(loc=legend_loc, title=title)

    save_figure(figure_options)
    plt.show()
示例#4
0
def analyze_kimura_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Derive per-row Kimura statistics and produce the Kimura analysis plots.

    Rows whose "Kimura-to-query" entry is the string "[]" are dropped; the
    remaining entries are parsed from their string form into Python lists.
    Mean, standard deviation, minimum and maximum of each list are stored in
    new columns before the downstream plotting helpers are called.

    :param env: environment; ``env["pd-work"]`` is used to name output figures
    :param df: data frame with (at least) the columns "Kimura-to-query",
        "Genome GC", "Ancestor" and "GCFID"
    """
    pd_work = env["pd-work"]

    # work on a copy so the caller's frame is not modified
    df = df[df["Kimura-to-query"] != "[]"].copy()
    df["Kimura-to-query"] = df["Kimura-to-query"].apply(ast.literal_eval)
    df["Average-Kimura"] = df["Kimura-to-query"].apply(np.mean)
    df["Std-Kimura"] = df["Kimura-to-query"].apply(np.std)

    sns.lmplot(df,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # "Kimura-to-query" holds lists, which cannot be averaged. Older pandas
    # silently dropped such columns; pandas >= 2.0 raises instead, so restrict
    # the aggregation to numeric columns explicitly.
    df_mean = df.groupby(["Ancestor", "GCFID"],
                         as_index=False).mean(numeric_only=True)

    sns.lmplot(df_mean,
               "Genome GC",
               "Average-Kimura",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": True,
                   "lowess": True,
                   "scatter_kws": {
                       "s": 5
                   },
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # Min/max kimura
    df["Min-Kimura"] = df["Kimura-to-query"].apply(min)
    df["Max-Kimura"] = df["Kimura-to-query"].apply(max)

    contour_kimura_per_ancestor(env, df)
    one_dim_Kimura_accuracy(env, df)

    kimura_dist_plot(env, df)
    heat_map_Kimura_accuracy(env,
                             df,
                             "Min-Kimura",
                             "Max-Kimura",
                             balance=True,
                             xlabel="Minimum Kimura",
                             ylabel="Maximum Kimura")
    heat_map_Kimura_accuracy(env,
                             df,
                             "Average-Kimura",
                             "Std-Kimura",
                             balance=False)
示例#5
0
def plot_catplot(df, column_x, column_y, figure_options=None):
    # type: (pd.DataFrame, str, str, FigureOptions) -> None
    """Draw a bar-kind categorical plot of ``column_y`` against ``column_x``.

    :param df: data frame holding both columns
    :param column_x: name of the column placed on the x axis
    :param column_y: name of the column placed on the y axis
    :param figure_options: axis properties; when its ``save_fig`` is set the
        figure is also written to that path
    """
    # catplot is figure-level and builds its own figure, so the options must
    # be applied to the grid's axis rather than to a separately created one
    # (which previously stayed empty and was shown as an extra blank figure).
    g = sns.catplot(x=column_x, y=column_y, kind="bar", data=df)

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)
    if figure_options is not None and figure_options.save_fig is not None:
        # `bbox_inches` is the savefig keyword; `bbox_index` was a typo
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
示例#6
0
def distplot(df, x, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, FigureOptions, Dict[str, Any]) -> None
    """Plot the distribution of column ``x`` with 50 bins, then save and show it.

    :param df: data frame holding the column
    :param x: name of the column whose distribution is plotted
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict forwarded to ``sns.distplot``; "kde"
        defaults to True when not given)
    """
    # start a fresh figure; distplot draws on the current axes
    plt.subplots()

    # copy before filling in defaults so the caller's dict is left untouched
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("kde", True)

    g = sns.distplot(df[x], bins=50, **sns_kwargs)

    FigureOptions.set_properties_for_axis(g.axes, figure_options)
    save_figure(figure_options)
    plt.show()
示例#7
0
def barplot(df, x, y, hue, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a bar plot with ``sns.barplot``, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column placed on the x axis
    :param y: name of the column placed on the y axis
    :param hue: name of the grouping column, or None for no grouping/legend
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict forwarded to ``sns.barplot``) and
        ``ax`` (axes to draw on; passed through as-is, may be None)
    """
    extra_kwargs = get_value(kwargs, "sns_kwargs", dict())
    target_ax = get_value(kwargs, "ax", None)

    bar_axes = sns.barplot(x=x, y=y, data=df, hue=hue, ax=target_ax, **extra_kwargs)

    grouped = hue is not None
    if grouped:
        # legend goes outside the plot area, to the right
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(bar_axes, figure_options)

    # tighten the layout before saving so the saved file matches what is shown
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()
示例#8
0
def kdeplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a kernel density estimate of one or two columns, then save and show it.

    :param df: data frame holding the columns
    :param x: name of the first column
    :param y: name of the second column, or None for a one-dimensional estimate
    :param hue: only checked against None to decide whether ``plt.legend`` is
        called; it is not forwarded to ``sns.kdeplot``
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` (dict forwarded to ``sns.kdeplot``)
    """
    extra_kwargs = get_value(kwargs, "sns_kwargs", dict())

    _, ax = plt.subplots()

    if y is None:
        second_series = None
    else:
        second_series = df[y]

    sns.kdeplot(df[x], second_series, legend=False, **extra_kwargs)

    if hue is not None:
        # legend goes outside the plot area, to the right
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(ax, figure_options)
    save_figure(figure_options)
    plt.show()
示例#9
0
def plot_scatter_for_columns_from_files(env,
                                        pf_data,
                                        column_names,
                                        delimiter=",",
                                        **kwargs):
    # type: (Environment, str, list[str], str, **str) -> None
    """Read a delimited file and scatter-plot the requested columns.

    Depending on ``scatter_in_separate_files`` either one figure per column
    pair or a single scatter matrix ("scatter.pdf") is produced under
    ``env["pd-work-results"]``.

    :param env: environment providing ``env["pd-work-results"]``
    :param pf_data: path of the file read with ``pd.read_csv``
    :param column_names: columns to plot
    :param delimiter: field delimiter of ``pf_data``
    :param kwargs: optional settings read through ``get_value``:
        ``filter_by_equal`` ((column, value) pair handed to
        ``filter_dataframe_by_equal``), ``scatter_in_separate_files``,
        ``limit_x_axis_features`` (columns used on the x axis instead of all
        of ``column_names``), ``color_by_value`` and ``title``
    """
    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    scatter_separately = get_value(kwargs, "scatter_in_separate_files", False)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    color_by_value = get_value(kwargs, "color_by_value", None)
    title = get_value(kwargs, "title", None)

    df = pd.read_csv(pf_data, delimiter=delimiter)

    if filter_by_equal is not None:
        wanted_column, wanted_value = filter_by_equal
        df = filter_dataframe_by_equal(df, wanted_column, wanted_value)

    # Single scatter matrix: handled first so the per-pair loop stays flat.
    if not scatter_separately:
        matrix_options = FigureOptions(save_fig=os.path.join(
            env["pd-work-results"], "scatter.pdf"))
        if color_by_value is None:
            plot_scatter_matrix_for_dataframe_columns(
                df, column_names, figure_options=matrix_options)
        else:
            plot_scatter_matrix(df,
                                column_names,
                                color_by=color_by_value,
                                figure_options=matrix_options)
        return

    # One figure per (x, y) column pair.
    if limit_x_axis_features is None:
        x_columns = column_names
    else:
        x_columns = limit_x_axis_features

    for col_x in x_columns:
        for col_y in column_names:
            pair_path = os.path.join(env["pd-work-results"],
                                     "scatter_{}_{}".format(col_x, col_y))
            plot_scatter_for_dataframe_columns(
                df, [col_x, col_y],
                color_by_value=color_by_value,
                figure_options=FigureOptions(title=title, save_fig=pair_path))
示例#10
0
def analyze_gms2_components_on_verified_set(env, gil):
    # type: (Environment, GenomeInfoList) -> None
    """Collect per-genome component results, print them as CSV and bar-plot them.

    :param env: environment; ``env["pd-work"]`` is used to name the figure
    :param gil: iterable of genome entries, each handed to
        ``analyze_gms2_components_on_verified_set_for_gi``
    """
    # one result frame per genome
    per_genome = [
        analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil
    ]

    df = pd.concat(per_genome, ignore_index=True, sort=False)
    df["Genome"] = df.apply(fix_names, axis=1)
    print(df.to_csv())

    _, ax = plt.subplots(figsize=(12, 4))

    # components are listed from most to least complete and shown reversed
    components = ["GMS2", "MGM2*", "Start Context", "RBS", "Start Codons", "Promoter", "MGM"]
    sns.barplot(df, "Genome", "Error", hue="Component",
                ax=ax,
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"])
                ),
                sns_kwargs={
                    "hue_order": reversed(components),
                    "palette": CM.get_map("gms2_components")
                })
def kimura_dist_plot(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Overlay the "Average-Kimura" density of every ancestor on one axis.

    :param env: environment; ``env["pd-work"]`` is used to name the figure
    :param df: data frame with the columns "Ancestor" and "Average-Kimura"
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = list(set(df["Ancestor"]))

    _, ax = plt.subplots()  # type: plt.Figure, plt.Axes

    # one density curve (no histogram) per ancestor, all on the same axis
    for anc in ancestors:
        is_anc = df["Ancestor"] == anc
        values = df[is_anc]["Average-Kimura"]
        seaborn.distplot(values,
                         ax=ax,
                         color=CM.get_map("ancestor")[anc],
                         hist=False,
                         label=anc)

    ax.legend(ancestors)
    ax.set_ylabel("PDF")

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
示例#12
0
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Compare three label files, print their sizes and draw a Venn diagram.

    Prints a two-line CSV with the number of labels in each file, in the
    GMS2/SBSP intersection, and in that intersection further intersected
    with NCBI.

    :param env: environment; ``env["pd-work"]`` holds the default output dir
    :param pf_gms2: path to the GMS2 labels file
    :param pf_sbsp: path to the SBSP labels file
    :param pf_ncbi: path to the NCBI labels file
    :param kwargs: ``venn_title`` (diagram title) and ``pf_venn`` (output
        path, default "venn.pdf" under ``env["pd-work"]``)
    """
    venn_title = get_value(kwargs, "venn_title", None)
    default_venn_path = os.path.join(env["pd-work"], "venn.pdf")
    pf_venn = get_value(kwargs, "pf_venn", default_venn_path)

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    # GMS2 vs SBSP
    gms2_vs_sbsp = LabelsComparisonDetailed(labels_gms2,
                                            labels_sbsp,
                                            name_a="gms2",
                                            name_b="sbsp")
    agreed_gms2_sbsp = gms2_vs_sbsp.intersection("a")

    # (GMS2 and SBSP) vs NCBI
    agreed_vs_ncbi = LabelsComparisonDetailed(agreed_gms2_sbsp,
                                              labels_ncbi,
                                              name_a="gms2_sbsp",
                                              name_b="ncbi")
    agreed_all = agreed_vs_ncbi.intersection("a")

    sizes = [
        len(labels_gms2),
        len(labels_sbsp),
        len(labels_ncbi),
        len(agreed_gms2_sbsp),
        len(agreed_all),
    ]
    header = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    print(header + "\n" + ",".join(str(size) for size in sizes))

    venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi,
                        FigureOptions(title=venn_title, save_fig=pf_venn))
def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Plot the RBS motif logo and its spacer distribution from a GMS2 model file.

    The left panel shows the information-content logo of the "RBS_MAT" motif
    (background taken from "NON_MAT"); the right panel shows the spacer
    probabilities built from "RBS_POS_DISTR".

    :param pd_figures: directory handed to ``next_name`` for the output file
    :param pf_mod: path to the GMS2 model file
    :param title: title of the logo panel
    """
    import logomaker as lm
    import matplotlib.pyplot as plt

    mod = GMS2Mod.init_from_file(pf_mod)
    motif = MotifModel(mod.items["RBS_MAT"], mod.items["RBS_POS_DISTR"])
    noncoding = GMS2Noncoding(mod.items["NON_MAT"])

    _, (ax_logo, ax_spacer) = plt.subplots(1, 2)

    # left panel: motif logo in information units
    info_matrix = lm.transform_matrix(motif.pwm_to_df(),
                                      from_type="probability",
                                      to_type="information",
                                      background=noncoding.pwm_to_array(0))
    lm.Logo(info_matrix, ax=ax_logo)
    ax_logo.set_title(title)
    ax_logo.set_ylim(0, 2)

    # right panel: spacer length distribution
    spacer = motif._spacer
    df_spacer = pd.DataFrame({
        "Distance from start": range(len(spacer)),
        "Probability": spacer
    })
    sns.lineplot(df_spacer,
                 "Distance from start",
                 "Probability",
                 ax=ax_spacer,
                 figure_options=FigureOptions(ylim=[0, 0.4]))

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))

    plt.show()
示例#14
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Load the statistics CSV named by ``args.pf_stats`` and plot it per genome.

    :param env: environment forwarded to ``viz_per_genome``
    :param args: parsed command-line arguments; only ``pf_stats`` is read
    """
    df = pd.read_csv(args.pf_stats)

    # return value is ignored; presumably adds derived columns to df in place
    # -- TODO confirm against compute_more
    compute_more(df)

    viz_per_genome(env, df)
def analyze_by_step_group(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages per step group, one line per genome.

    Each row of ``df`` carries, in column "by_step_group_<tag>", the data for
    one genome; these are expanded into frames, tagged with the genome name
    and stacked. Genomes "A. pernix" and "Synechocystis" are left out.

    :param df: data frame with the columns "Genome" and "by_step_group_<tag>"
    :param pd_work: directory handed to ``next_name`` for the output files
    :param fn_prefix: not used by this function
    :param tag: suffix selecting the per-row column and the plotted columns
    """
    excluded_genomes = {"A. pernix", "Synechocystis"}
    source_column = "by_step_group_{}".format(tag)

    frames = list()
    for row in df.index:
        genome_df = pd.DataFrame(df.at[row, source_column])
        genome_df["Genome"] = df.at[row, "Genome"]

        if df.at[row, "Genome"] not in excluded_genomes:
            frames.append(genome_df)

    df_acc = pd.concat(frames)

    # (match kind, lower y limit) for the two plots, drawn in this order
    for match_kind, y_low in (("3p", None), ("5p-3p", 90)):
        sns.catplot(
            df_acc,
            "Step Group",
            "Percentage {} match: Verified from {}".format(match_kind, tag),
            hue="Genome",
            kind="point",
            figure_options=FigureOptions(
                title="Percentage {} match versus minimum support".format(
                    match_kind),
                ylabel="Percentage of {} match".format(match_kind),
                save_fig=next_name(pd_work),
                ylim=[y_low, 100.5]),
        )

    print(df_acc.to_string())
示例#16
0
def scatter(df, column_x, column_y, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, FigureOptions, Dict[str, Any]) -> None
    """Draw a joint scatter plot of two columns, then optionally save and show it.

    :param df: data frame holding both columns
    :param column_x: name of the column placed on the x axis
    :param column_y: name of the column placed on the y axis
    :param figure_options: axis properties; when its ``save_fig`` is set the
        figure is also written to that path
    :param kwargs: ``identity`` (add a red dashed identity line). ``column_z``
        is accepted for backward compatibility but has no effect on the plot.
    """
    identity = get_value(kwargs, "identity", False)

    # jointplot is figure-level and builds its own figure. Decorations must go
    # on the grid's joint axis; previously they were applied to a separately
    # created empty axis and never reached the plot. x/y are passed by keyword
    # because newer seaborn releases no longer accept them positionally.
    grid = sns.jointplot(x=df[column_x], y=df[column_y], kind="scatter",
                         alpha=0.3, s=10, linewidth=0)
    ax = grid.ax_joint

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)
    if figure_options is not None and figure_options.save_fig is not None:
        # `bbox_inches` is the savefig keyword; `bbox_index` was a typo
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
示例#17
0
def df_plot_scatter_matrix(env, df, column_names, **kwargs):
    # type: (Environment, pd.DataFrame, Union[List, Set], Dict[str, Any]) -> None
    """Draw a scatter matrix of ``column_names`` into "scatter.pdf".

    :param env: environment; the file is written under
        ``env["pd-work-results"]``
    :param df: data frame holding the columns
    :param column_names: columns to include in the matrix
    :param kwargs: ``color_by_value`` is read and passed as ``color_by``; all
        kwargs (including ``color_by_value``) are also forwarded unchanged to
        ``plot_scatter_matrix``
    """
    color_by_value = get_value(kwargs, "color_by_value", None)

    # The call is the same whether or not a colouring column was requested
    # (color_by is simply None in the latter case).
    output_path = os.path.join(env["pd-work-results"], "scatter.pdf")
    plot_scatter_matrix(df,
                        column_names,
                        color_by=color_by_value,
                        figure_options=FigureOptions(save_fig=output_path),
                        **kwargs)
示例#18
0
def plot_hist_by_group(df_data,
                       column_x,
                       column_group=None,
                       figure_options=None,
                       **kwargs):
    # type: (pd.DataFrame, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Plot the distribution of ``column_x``, optionally split by a grouping column.

    With ``column_group`` one density curve per group is drawn; without it a
    plain histogram with black bar edges is drawn.

    :param df_data: data frame holding the columns
    :param column_x: name of the column whose distribution is plotted
    :param column_group: optional name of the grouping column
    :param figure_options: axis properties; when its ``save_fig`` is set the
        figure is also written to that path
    :param kwargs: ``bins`` (histogram bins, default 10; ungrouped case only)
        and ``cumulative`` (draw cumulative, unshaded densities; grouped case
        only)
    """
    bins = get_value(kwargs, "bins", 10)
    cumulative = get_value(kwargs, "cumulative", False)

    _, ax = plt.subplots()

    # shading is only used for non-cumulative densities
    shade = not cumulative

    if column_group is not None:
        for name, df_group in df_data.groupby(column_group):
            sns.distplot(df_group[column_x],
                         hist=False,
                         kde_kws={
                             "shade": shade,
                             "cumulative": cumulative
                         },
                         label=name,
                         ax=ax)
    else:
        sns.distplot(df_data[column_x],
                     bins=bins,
                     hist=True,
                     kde=False,
                     hist_kws={"edgecolor": "black"},
                     ax=ax)

    FigureOptions.set_properties_for_axis(ax, figure_options)

    if figure_options is not None and figure_options.save_fig is not None:
        # `bbox_inches` is the savefig keyword; `bbox_index` was a typo
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
示例#19
0
def lmplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> sns.FacetGrid
    """Draw a regression plot with ``sns.lmplot``, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column placed on the x axis
    :param y: name of the column placed on the y axis
    :param hue: optional name of the column used for colour grouping
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs`` (dict forwarded to ``sns.lmplot``; "aspect" defaults
        to 2), ``legend``, ``legend_loc`` and ``legend_title``
    :return: the grid object returned by ``sns.lmplot``
    """
    # copy before filling in defaults so the caller's dict is left untouched
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("aspect", 2)

    # seaborn's own legend is disabled; it is drawn below on the first axis
    g = sns.lmplot(x=x, y=y, hue=hue, data=df, legend=False, **sns_kwargs)
    axis = g.axes[0][0]

    FigureOptions.set_properties_for_axis(axis, figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # default placement: outside the plot, to the right
            axis.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            # keep the requested title when an explicit location is given
            axis.legend(loc=legend_loc, title=title)

    save_figure(figure_options, fig=g.fig)
    plt.subplots_adjust(right=1)
    plt.show()
    return g
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Plot 3p and 5p-3p match percentages against minimum support per genome.

    Each row of ``df`` carries, in column "by_support_<tag>", the data for one
    genome; these are expanded into frames, tagged with the genome name and
    stacked. Genomes "A. pernix" and "Synechocystis" are left out.

    :param df: data frame with the columns "Genome" and "by_support_<tag>"
    :param pd_work: directory handed to ``next_name`` for the output files
    :param fn_prefix: not used by this function
    :param tag: suffix selecting the per-row column and the plotted columns
    """
    excluded_genomes = {"A. pernix", "Synechocystis"}
    source_column = "by_support_{}".format(tag)

    frames = list()
    for row in df.index:
        genome_df = pd.DataFrame(df.at[row, source_column])
        genome_df["Genome"] = df.at[row, "Genome"]

        if df.at[row, "Genome"] not in excluded_genomes:
            frames.append(genome_df)

    df_acc = pd.concat(frames)

    # (y column, title, y label, lower y limit) for the two plots, in order
    plot_specs = [
        ("Percentage 3p match: Verified from {}".format(tag),
         "Percentage of verified genes predicted\nby {}".format(tag),
         "Percentage",
         None),
        ("Percentage 5p-3p match: Verified from {}".format(tag),
         "Percentage of predicted {} genes\nwith correct 5' end".format(tag),
         "Percentage of 5p-3p match",
         90),
    ]

    for y_column, plot_title, y_label, y_low in plot_specs:
        sns.lineplot(
            df_acc,
            "Min Support",
            y_column,
            hue="Genome",
            figure_options=FigureOptions(
                title=plot_title,
                ylabel=y_label,
                save_fig=next_name(pd_work),
                ylim=[y_low, 100.5]))
示例#21
0
def lineplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a line plot with ``sns.lineplot`` and optionally save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column placed on the x axis
    :param y: name of the column placed on the y axis
    :param hue: optional name of the column used for colour grouping
    :param figure_options: passed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs`` (dict forwarded to ``sns.lineplot``),
        ``ax`` (axes to draw on; a new figure is created when absent),
        ``show`` (save and show the figure; defaults to True only when no
        ``ax`` was supplied),
        ``identity`` (add a red dashed identity line),
        ``legend``, ``legend_loc``, ``legend_ncol`` and ``legend_title``
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    show = get_value(kwargs, "show", ax is None)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    legend_ncol = get_value(kwargs, "legend_ncol", 1)
    identity = get_value(kwargs, "identity", False)

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()

    sns.lineplot(x=x, y=y, hue=hue, data=df, ax=ax, legend=legend, **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # The legend is attached to `ax` itself rather than through
        # plt.legend, which targets pyplot's current axes and can miss a
        # caller-supplied axis.
        if not legend_loc:
            # default placement: outside the plot, to the right
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title, ncol=legend_ncol)
        else:
            ax.legend(loc=legend_loc, ncol=legend_ncol, title=title)
        if title is not None and len(title) == 0:
            # An explicitly empty title rebuilds the legend without its first
            # entry (presumably the hue-name header seaborn adds -- confirm
            # against the seaborn version in use).
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[1:], labels=labels[1:], ncol=legend_ncol)

    if show:
        save_figure(figure_options, fig)
        plt.show()
示例#22
0
    def _histogram_multiple_stats_summary_by_attribute(self, list_df,
                                                       pd_output):
        # type: (List[Tuple[str, pd.DataFrame]], str) -> None
        """Stack the given frames and bar-plot "% Common 5'" per step.

        :param list_df: (value, data frame) pairs; each frame gets a "step"
            column set to its value (the frame is modified in place)
        :param pd_output: directory in which "histogram.pdf" is written
        """
        frames = list()
        for value, curr_df in list_df:
            curr_df["step"] = value
            frames.append(curr_df)

        # Concatenate once instead of growing a frame with DataFrame.append,
        # which copied all rows on every call and was removed in pandas 2.0.
        # An empty input still yields an empty frame, as before.
        if frames:
            df = pd.concat(frames, ignore_index=True)
        else:
            df = pd.DataFrame()

        plot_catplot(
            df, "step", "% Common 5'",
            FigureOptions(save_fig=os.path.join(pd_output, "histogram.pdf")))
def contour_kimura_per_ancestor(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one Min-/Max-Kimura density contour panel per ancestor.

    Panels are arranged in two rows with shared axes; a single pair of axis
    labels is placed on an invisible axis spanning the whole figure.

    :param env: environment; ``env["pd-work"]`` is used to name the figure
    :param df: data frame with the columns "Ancestor", "Min-Kimura" and
        "Max-Kimura"
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = sorted(set(df["Ancestor"]))
    n_columns = math.ceil(len(ancestors) / 2)
    fig, axes = plt.subplots(2,
                             n_columns,
                             sharex=True,
                             sharey=True,
                             figsize=(6, 6))

    for anc, panel in zip(ancestors, axes.ravel()):
        anc_rows = df[df["Ancestor"] == anc]
        seaborn.kdeplot(anc_rows["Min-Kimura"].values,
                        anc_rows["Max-Kimura"].values,
                        ax=panel)
        panel.set_title(anc)

    # Frameless axis over the whole figure: it carries the shared x/y labels,
    # with its own ticks and tick labels switched off.
    fig.add_subplot(111, frameon=False)
    hidden_ticks = dict(top=False,
                        bottom=False,
                        left=False,
                        right=False,
                        which="both",
                        labelbottom=False,
                        labeltop=False,
                        labelleft=False,
                        labelright=False)
    plt.tick_params(**hidden_ticks)
    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))

    plt.show()
示例#24
0
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Box-plot the per-GCFID summary for each SBSP step, grouped by ancestor.

    For each of the steps "A", "B" and "C" the rows predicted at that step
    are summarised with ``get_summary_per_gcfid``; the summaries are stacked
    and plotted with one box per step and ancestor.

    :param env: environment; ``env['pd-work']`` is used to name the figure
    :param df: data frame with (at least) the column "Predicted-at-step"
    """
    pd_work = env['pd-work']

    def summary_for_step(step):
        # summary of the rows predicted at `step`, tagged with the step name
        at_step = df[df["Predicted-at-step"] == step]
        summary = get_summary_per_gcfid(at_step)
        summary["SBSP Step"] = step
        return summary

    summaries = [summary_for_step(step) for step in ["A", "B", "C"]]
    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    options = FigureOptions(save_fig=next_name(pd_work),
                            xlabel="Clade",
                            ylabel="Err(NCBI,GMS2=SBSP)")
    sns.catplot(df_per_gcfid_per_step,
                "Ancestor",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="SBSP Step",
                kind="box",
                legend_loc="best",
                figure_options=options)
示例#25
0
def plot_histograms_for_columns(env, df_data, column_names, **kwargs):
    # type: (Environment, pd.DataFrame, List[str], Dict[str, Any]) -> None
    """Plot one histogram per column via ``plot_hist_by_group``.

    Each figure is saved under ``env["pd-work"]`` as "hist<column>.pdf",
    where spaces in the column name become underscores and parentheses are
    removed.

    :param env: environment providing ``env["pd-work"]``
    :param df_data: data frame holding the columns
    :param column_names: columns to plot
    :param kwargs: optional settings read through ``get_value``:
        ``group_by`` (grouping column), ``title_name`` (figure title),
        ``xlim`` and ``bins`` (default 10)
    """
    group_by = get_value(kwargs, "group_by", None)
    title_name = get_value(kwargs, "title_name", "", default_if_none=True)
    xlim = get_value(kwargs, "xlim", None)

    for column in column_names:
        # file-name friendly version of the column name
        safe_name = column.replace(" ", "_")
        safe_name = safe_name.replace("(", "")
        safe_name = safe_name.replace(")", "")
        output_path = os.path.join(env["pd-work"],
                                   "hist{}.pdf".format(safe_name))

        options = FigureOptions(xlabel=column,
                                title="{}".format(title_name),
                                ylabel="Frequency",
                                save_fig=output_path,
                                xlim=xlim)

        plot_hist_by_group(df_data,
                           column,
                           group_by,
                           figure_options=options,
                           bins=get_value(kwargs, "bins", 10))
示例#26
0
def df_plot_scatter_separate(env, df, column_pairs, **kwargs):
    # type: (Environment, pd.DataFrame, List[List], Dict[str, Any]) -> None
    """Draw one scatter plot per column pair, each saved to its own PDF.

    Files are written under ``env["pd-work-results"]`` as
    "scatter_<x>_<y>.pdf".

    :param env: environment providing ``env["pd-work-results"]``
    :param df: data frame holding the columns
    :param column_pairs: (x column, y column) pairs to plot
    :param kwargs: optional settings read through ``get_value``:
        ``color_by_value``, ``limit_x_axis_features`` (keep only pairs whose
        x column is in this collection) and ``title``; ``jitter`` is read but
        not used
    """
    color_by_value = get_value(kwargs, "color_by_value", None)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    jitter = get_value(kwargs, "jitter", None)  # read but currently unused
    title = get_value(kwargs, "title", None)

    if limit_x_axis_features is not None:
        column_pairs = [
            pair for pair in column_pairs
            if pair[0] in limit_x_axis_features
        ]

    for col_x, col_y in column_pairs:
        output_path = os.path.join(env["pd-work-results"],
                                   "scatter_{}_{}.pdf".format(col_x, col_y))
        options = FigureOptions(save_fig=output_path,
                                xlabel=col_x,
                                ylabel=col_y,
                                title=title)
        plot_scatter_for_dataframe_columns(df, [col_x, col_y],
                                           color_by_value=color_by_value,
                                           figure_options=options)
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot how consistency and tool agreement vary with upstream distance.

    The function
      1. drops rows whose "Upstream-distance" is the literal string "[]",
         parses the remaining strings with ``ast.literal_eval`` and stores
         ``most_frequent(...)`` of each parsed value in
         "Most frequent upstream";
      2. adds "PC(x,0)" and "PC(x,3)" using ``compute_consistency`` with
         flexibility 0 and 3;
      3. keeps rows with "Support" > 10 and a most-frequent upstream
         distance strictly between -50 and 100;
      4. draws a series of figures (LOWESS trends, distributions, per-clade
         point plots and per-range summaries).  Figures created through the
         ``sns`` helpers / ``save_figure`` are written to
         ``<env["pd-work"]>/upstream_distances`` under names produced by
         ``next_name``; several figures are only shown with ``plt.show()``.

    Args:
        env: environment mapping; only ``env["pd-work"]`` is read here.
        df: data frame that must provide the columns "Upstream-distance"
            (string-encoded Python literal), "Support", "Ancestor",
            "GMS2=SBSP", "NCBI" and "(GMS2=SBSP)!=NCBI".  The caller's
            frame is not modified (a filtered copy is used).
    """
    # Function-scope imports, as in the original.  ``seaborn`` is the plain
    # library; the module-level ``sns`` used below is called with a
    # (df, x, y, ..., sns_kwargs=, figure_options=) signature instead.
    import seaborn
    import matplotlib.pyplot as plt

    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists, then turn the string column into Python objects
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    # (a tuple, not a set, so the new columns are always added in this order)
    for flexibility in (0, 3):
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # df_tmp keeps every well-supported row inside the distance window ...
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]
    # ... while df is additionally restricted to rows where the
    # "GMS2=SBSP" column is true.
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    # plot PC(x,0) and PC(x,3) against distance on one figure
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)",
                "Ancestor"]], ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    # Rows whose most frequent upstream distance lies strictly in (-10, 10);
    # used by the two per-clade point plots below.
    df_near = df[(df["Most frequent upstream"] < 10)
                 & (df["Most frequent upstream"] > -10)]

    # percentage of rows (within each clade) per distance
    (df_near.groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # same plot with absolute counts instead of percentages
    (df_near.groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # per-clade histogram (left axis) overlaid with a KDE (right axis)
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)

        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    # LOWESS trend of each consistency measure, one line per clade
    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # Summaries per distance range [low, high).
    # First variant: summarize per gcfid, then average per ancestor.
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:

        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])

        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # Second variant: do not average per gcfid - average per ancestor
    list_collect = list()

    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # percentage of (GMS2=SBSP with an NCBI value) rows that are
            # flagged "(GMS2=SBSP)!=NCBI"
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor":
                ancestor,
                "Range":
                str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP":
                sensitivity,
                "GMS2=SBSP":
                f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # One panel per ancestor: percentage on the left axis, count on the right.
    ancestors = list(set(df_tmp["Ancestor"]))
    # squeeze=False keeps ``axes`` a 2-D array even for a single ancestor;
    # without it plt.subplots returns a bare Axes and ``.ravel()`` fails.
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
        squeeze=False,
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        # x/y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        seaborn.lineplot(x="range_avg",
                         y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot(x="range_avg",
                         y="GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    # All ancestors on a single pair of twin axes.
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot(x="range_avg",
                     y="(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot(x="range_avg",
                     y="GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")

    # colour each axis label like the quantity it describes
    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()
def compare_distance_local_vs_global(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    """Scatter-plot local-alignment quantities against their global ones.

    Rows are first restricted to 0.001 < "global_distance" < 0.5 and
    "global_length_without_gaps" < 1100.  Five figures are then produced
    with ``scatter`` and saved inside ``env["pd-work"]``:
    distance, alignment length and ungapped alignment length (local vs
    global, each with ``identity=True``), followed by the distance
    difference (global - local) against the local/global ratio of the
    alignment length and of the ungapped alignment length.

    Options read from ``kwargs`` via ``get_value``:
        extension: file extension of the saved figures (default "png").
        fn_prefix: prefix of every output file name (default "", looked up
            with ``default_if_none=True``).
    """
    pd_work = env["pd-work"]
    ext = get_value(kwargs, "extension", "png")
    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)

    def _output_path(template):
        # type: (str) -> str
        # ``template`` has two "{}" slots: file-name prefix and extension.
        return os.path.join(pd_work, template.format(fn_prefix, ext))

    # Apply the row filters one after another, copying after each step.
    row_filters = (
        ("global_distance", lambda values: values < 0.5),
        ("global_distance", lambda values: values > 0.001),
        ("global_length_without_gaps", lambda values: values < 1100),
    )
    for column, keep in row_filters:
        df = df[keep(df[column])].copy()

    # compare kimura local vs global
    distance_options = FigureOptions(
        title="Distance by local vs global alignment",
        xlabel="Global",
        ylabel="Local",
        xlim=[0, 0.8],
        ylim=[0, 0.8],
        save_fig=_output_path("{}distance_local_vs_global.{}"),
        balanced=True)
    scatter(df,
            "global_distance",
            "local_distance",
            figure_options=distance_options,
            identity=True)

    # compare alignment length of local vs global
    length_options = FigureOptions(
        title="Alignment length of local vs global",
        xlabel="Global",
        ylabel="Local",
        save_fig=_output_path("{}alignment_length_local_vs_global.{}"),
        balanced=True)
    scatter(df,
            "global_length",
            "local_length",
            figure_options=length_options,
            identity=True)

    # compare ungapped alignment length of local vs global
    ungapped_options = FigureOptions(
        title="Ungapped alignment length of local vs global",
        xlabel="Global",
        ylabel="Local",
        save_fig=_output_path(
            "{}ungapped_alignment_length_local_vs_global.{}"),
        balanced=True)
    scatter(df,
            "global_length_without_gaps",
            "local_length_without_gaps",
            figure_options=ungapped_options,
            identity=True)

    # Derived columns: how much the two distances differ, and how much of
    # the global alignment the local alignment covers.
    df["diff_distance"] = df["global_distance"] - df["local_distance"]

    ungapped_local = df["local_length_without_gaps"]
    ungapped_global = df["global_length_without_gaps"]
    df["ratio_ungapped_length"] = ungapped_local / ungapped_global
    df["ratio_length"] = df["local_length"] / df["global_length"]

    ratio_options = FigureOptions(
        title="Difference in distance vs ratio of alignment lengths",
        xlabel="Ratio of lengths",
        ylabel="Difference in distance",
        save_fig=_output_path("{}diff_distance_vs_ratio_length.{}"),
    )
    scatter(df, "ratio_length", "diff_distance", figure_options=ratio_options)

    ratio_ungapped_options = FigureOptions(
        title="Difference in distance vs ratio of ungapped alignment lengths",
        xlabel="Ratio of ungapped lengths",
        ylabel="Difference in distance",
        save_fig=_output_path(
            "{}diff_distance_vs_ratio_ungapped_length.{}"),
    )
    scatter(df,
            "ratio_ungapped_length",
            "diff_distance",
            figure_options=ratio_ungapped_options)
示例#29
0
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one ``loess_with_stde`` panel of "M:<tag>" vs "GC" per tag.

    The tags come from ``get_tags_for_5prime(df)``.  Panels are laid out on
    a 2-row grid with shared x and y axes; a dedicated axes on the right is
    handed to the first panel for the colorbar.  The figure is saved under a
    name produced by ``next_name(env["pd-work"])`` and then shown.

    Args:
        env: environment mapping; only ``env["pd-work"]`` is read here.
        df: data frame with a "GC" column and one "M:<tag>" column per tag.
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)

    if num_tags == 0:
        # Nothing to plot; a grid with zero columns cannot be created.
        return

    # With an odd number of tags the last panel of the grid stays empty.
    fig, ax = plt.subplots(2,
                           math.ceil(num_tags / 2),
                           sharey="all",
                           sharex="all")
    # Narrow axes at the right edge of the figure reserved for the colorbar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    cbar_enable = {
        "cbar_ax": cbar_ax,
        "cbar": True,
    }

    # Iterate over the tags and panels only.  The previous version also
    # zipped in a 4-element colour list that was never used, which silently
    # dropped every tag after the fourth.
    for index, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        # Only the first panel receives the colorbar arguments.
        cbar_kws = cbar_enable if index == 0 else dict()
        loess_with_stde(df, "GC", f"M:{tag}", a, tag.replace("=", ","),
                        **kws, **cbar_kws)

        # Display names used in the panel title.
        a.set_title(
            tag.replace("=",
                        ",").replace("NCBI",
                                     "PGAP").replace("GMS2", "GeneMarkS-2"))
        # Per-panel axis labels are blanked; shared labels are added below.
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Invisible axes spanning the whole figure, used only to carry the
    # shared x/y labels.
    fig.add_subplot(111, frameon=False)
    # hide tick and tick label of the big axes
    plt.tick_params(top=False,
                    bottom=False,
                    left=False,
                    right=False,
                    which="both",
                    labelbottom=False,
                    labeltop=False,
                    labelleft=False,
                    labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    # Leave the right ~10% of the figure free for the colorbar axes.
    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])

    save_figure(figure_options, fig)
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis for a genome list and plot the result.

    The genome list is read from ``args.pf_genome_list``.  The analysis runs
    either directly or through ``PBS``, depending on the "use-pbs"
    parallelization option.  The resulting table is written to
    ``summary.csv`` in ``env["pd-work"]`` and a set of figures is saved under
    ``<env["pd-work"]>/summary_figures``.

    Args:
        env: environment mapping; ``env["pd-work"]`` is read here and the
            object is also forwarded to the analysis.
        args: parsed command-line arguments; ``args.pf_genome_list`` is used
            directly and ``vars(args)`` feeds the parallelization options.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if not prl_options["use-pbs"]:
        # run the analysis in this process
        df = relative_entropy_analysis(env, gil, prl_options)
    else:
        # hand the genome list to PBS with the splitter/merger given below
        # and concatenate the returned data frames
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    # every figure below is saved under a fresh name from next_name(pd_figures)
    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    # "Error" against "Percent" and the relative-entropy columns
    # (scatter and line plots)
    sns.scatterplot(df,
                    "Percent",
                    "Error",
                    figure_options=FigureOptions(
                        ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE Motif",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE Spacer",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    # "RE Spacer" against "RE Motif", requested with identity=True
    sns.scatterplot(
        df,
        "RE Motif",
        "RE Spacer",
        hue="Genome",
        identity=True,
        figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    # the same relationships drawn with sns.lmplot, one hue per genome
    sns.lmplot(df,
               "Percent",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE Motif",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE Spacer",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    # "RE" against "Percent"
    sns.lmplot(df,
               "Percent",
               "RE",
               hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))