def scatterplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn scatter plot of two data-frame columns, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: name of the column used for the x axis
    :param y: name of the column used for the y axis
    :param hue: optional column name used to colour the points
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings, read through ``get_value``:
        ``sns_kwargs`` (extra keyword arguments for ``sns.scatterplot``),
        ``ax`` (axes to draw on; a new figure is created when missing),
        ``identity`` (draw a dashed red identity line via ``add_identity``),
        ``legend``, ``legend_loc`` and ``legend_title``.
    """
    # Work on a copy so that adding "ax" below never mutates the caller's dict.
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    ax = get_value(kwargs, "ax", None)
    identity = get_value(kwargs, "identity", False)

    if ax is None:
        _, ax = plt.subplots()

    # Forward the axes explicitly; without it seaborn draws on pyplot's
    # current axes, which is not necessarily the one the caller supplied.
    # setdefault keeps an "ax" already present in sns_kwargs working.
    sns_kwargs.setdefault("ax", ax)
    plotted_ax = sns.scatterplot(x=x, y=y, hue=hue, data=df, linewidth=0,
                                 **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # Attach the legend to the axes that was actually plotted on.
        if not legend_loc:
            plotted_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5),
                              title=title)
        else:
            plotted_ax.legend(loc=legend_loc)

    save_figure(figure_options)
    plt.show()
def venn_diagram_5prime(labels_a, labels_b, labels_c, figure_options=None):
    # type: (Labels, Labels, Labels, FigureOptions) -> None
    """Show a three-set Venn diagram with equal-sized circles for three label sets.

    The label sets are first reduced to their common genes; the region texts
    come from ``numbers_for_3d_venn``.

    :param labels_a: first set of labels
    :param labels_b: second set of labels
    :param labels_c: third set of labels
    :param figure_options: axis properties and, when ``save_fig`` is set,
        the path the figure is written to
    """
    # First, reduce each set to common genes.
    common_labels = reduce_labels_to_genes_in_all(
        [labels_a, labels_b, labels_c])
    region_texts = numbers_for_3d_venn(*common_labels)

    fig, ax = plt.subplots()

    # All seven regions get unit size so the circles come out equal; the
    # actual values are written into the region labels afterwards.
    set_names = [labels.name for labels in common_labels]
    diagram = venn3([1] * 7, set_labels=set_names)
    for region_id, text in region_texts.items():
        diagram.get_label_by_id(region_id).set_text(text)

    # Title and annotation.
    FigureOptions.set_properties_for_axis(ax, figure_options)

    should_save = (figure_options is not None
                   and figure_options.save_fig is not None)
    if should_save:
        plt.savefig(figure_options.save_fig, bbox_inches='tight')

    plt.show()
def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn categorical plot, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colouring
    :param kind: plot kind forwarded to ``sns.catplot``
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs``, ``legend``, ``legend_loc``, ``legend_title``
    """
    extra_kwargs = get_value(kwargs, "sns_kwargs", dict())

    grid = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False,
                       aspect=1.5, **extra_kwargs)

    if kind == "point":
        # Thin out every line of the grid's axes.
        plt.setp(grid.ax.lines, linewidth=1)

    FigureOptions.set_properties_for_axis(grid.axes[0][0], figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)

    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if legend_loc:
            plt.legend(loc=legend_loc)
        else:
            # Default placement: outside the axes, to the right.
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5),
                       title=title)

    save_figure(figure_options)
    plt.show()
def analyze_kimura_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Derive per-row Kimura statistics and produce the Kimura plots.

    Rows whose "Kimura-to-query" value is the string "[]" are dropped, the
    remaining strings are parsed with ``ast.literal_eval``, and mean, std,
    min and max columns are added before the plotting helpers are called.

    :param env: environment; ``env["pd-work"]`` is used for figure names
    :param df: data frame with "Kimura-to-query", "Genome GC", "Ancestor"
        and "GCFID" columns
    """
    pd_work = env["pd-work"]

    def lowess_kwargs(with_scatter):
        # Keyword arguments shared by the two lmplot calls below.
        return {
            "scatter": with_scatter,
            "lowess": True,
            "scatter_kws": {"s": 5},
            "palette": CM.get_map("ancestor"),
        }

    non_empty = df["Kimura-to-query"] != "[]"
    df = df[non_empty].copy()
    df["Kimura-to-query"] = df["Kimura-to-query"].apply(ast.literal_eval)

    for column, reducer in (("Average-Kimura", np.mean),
                            ("Std-Kimura", np.std)):
        df[column] = df["Kimura-to-query"].apply(reducer)

    sns.lmplot(df, "Genome GC", "Average-Kimura", hue="Ancestor",
               sns_kwargs=lowess_kwargs(False),
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # Same plot on the per-(Ancestor, GCFID) means, this time with points.
    df_mean = df.groupby(["Ancestor", "GCFID"], as_index=False).mean()
    sns.lmplot(df_mean, "Genome GC", "Average-Kimura", hue="Ancestor",
               sns_kwargs=lowess_kwargs(True),
               figure_options=FigureOptions(save_fig=next_name(pd_work)))

    # Min/max kimura
    for column, reducer in (("Min-Kimura", min), ("Max-Kimura", max)):
        df[column] = df["Kimura-to-query"].apply(reducer)

    contour_kimura_per_ancestor(env, df)
    one_dim_Kimura_accuracy(env, df)
    kimura_dist_plot(env, df)

    heat_map_Kimura_accuracy(env, df, "Min-Kimura", "Max-Kimura",
                             balance=True,
                             xlabel="Minimum Kimura",
                             ylabel="Maximum Kimura")
    heat_map_Kimura_accuracy(env, df, "Average-Kimura", "Std-Kimura",
                             balance=False)
def plot_catplot(df, column_x, column_y, figure_options=None):
    """Draw a seaborn bar catplot of ``column_y`` against ``column_x``.

    :param df: data frame holding both columns
    :param column_x: column name for the x axis
    :param column_y: column name for the y axis
    :param figure_options: axis properties and, when ``save_fig`` is set,
        the path the figure is written to
    """
    # catplot is figure-level: it builds its own figure. Configure the axes
    # of that figure rather than a separately created (and empty) one.
    grid = sns.catplot(x=column_x, y=column_y, kind="bar", data=df)

    FigureOptions.set_properties_for_axis(grid.ax, figure_options)

    if figure_options is not None and figure_options.save_fig is not None:
        # The keyword is "bbox_inches"; "bbox_index" is not a savefig option.
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
def distplot(df, x, figure_options=None, **kwargs):
    """Draw a seaborn distribution plot of one column, then save and show it.

    :param df: data frame holding the column
    :param x: name of the column to plot
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` holds extra keyword arguments for
        ``sns.distplot``; ``kde`` defaults to True and ``bins`` to 50 when
        they are not given there
    """
    _, ax = plt.subplots()

    # Copy before filling in defaults so the caller's dict is left untouched.
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("kde", True)
    # A default rather than a fixed argument: a caller-supplied "bins" would
    # otherwise collide with the explicit keyword.
    sns_kwargs.setdefault("bins", 50)

    g = sns.distplot(df[x], **sns_kwargs)

    FigureOptions.set_properties_for_axis(g.axes, figure_options)
    save_figure(figure_options)
    plt.show()
def barplot(df, x, y, hue, figure_options=None, **kwargs):
    """Draw a seaborn bar plot, then save and show it.

    :param df: data frame holding the columns to plot
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: column name used for colouring (may be None)
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` (extra arguments for ``sns.barplot``) and
        ``ax`` (axes to draw on)
    """
    extra_kwargs = get_value(kwargs, "sns_kwargs", dict())
    target_ax = get_value(kwargs, "ax", None)

    drawn_ax = sns.barplot(x=x, y=y, data=df, hue=hue, ax=target_ax,
                           **extra_kwargs)

    has_hue = hue is not None
    if has_hue:
        # Legend outside the axes, to the right.
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(drawn_ax, figure_options)
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()
def kdeplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn kernel-density plot of one or two columns.

    :param df: data frame holding the columns
    :param x: name of the first column
    :param y: name of the second column, or None to pass only one
    :param hue: when not None a legend is added next to the axes
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` holds extra arguments for ``sns.kdeplot``
    """
    extra_kwargs = get_value(kwargs, "sns_kwargs", dict())
    _, ax = plt.subplots()

    second = df[y] if y is not None else None
    g = sns.kdeplot(df[x], second, legend=False, **extra_kwargs)

    if hue is not None:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(ax, figure_options)
    save_figure(figure_options)
    plt.show()
def plot_scatter_for_columns_from_files(env, pf_data, column_names, delimiter=",", **kwargs):
    # type: (Environment, str, list[str], str, **str) -> None
    """Read a delimited file and scatter-plot the requested columns.

    Either one scatter matrix is written to "scatter.pdf", or (when
    ``scatter_in_separate_files`` is set) one figure per column pair. All
    output goes under ``env["pd-work-results"]``.

    :param env: environment providing ``env["pd-work-results"]``
    :param pf_data: path of the file read with ``pd.read_csv``
    :param column_names: columns to plot
    :param delimiter: field delimiter of the input file
    :param kwargs: ``filter_by_equal`` (a ``(column, value)`` pair),
        ``scatter_in_separate_files``, ``limit_x_axis_features``,
        ``color_by_value``, ``title``
    """
    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    scatter_separately = get_value(kwargs, "scatter_in_separate_files", False)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    color_by_value = get_value(kwargs, "color_by_value", None)
    title = get_value(kwargs, "title", None)

    df = pd.read_csv(pf_data, delimiter=delimiter)

    if filter_by_equal is not None:
        filter_column_name, value = filter_by_equal
        df = filter_dataframe_by_equal(df, filter_column_name, value)

    if not scatter_separately:
        # Single scatter matrix, with or without colouring.
        matrix_options = FigureOptions(save_fig=os.path.join(
            env["pd-work-results"], "scatter.pdf"))
        if color_by_value is None:
            plot_scatter_matrix_for_dataframe_columns(
                df, column_names, figure_options=matrix_options)
        else:
            plot_scatter_matrix(df, column_names, color_by=color_by_value,
                                figure_options=matrix_options)
        return

    # One figure per (x, y) pair; the x side may be restricted.
    if limit_x_axis_features is None:
        x_axis_column_names = column_names
    else:
        x_axis_column_names = limit_x_axis_features

    for f1 in x_axis_column_names:
        for f2 in column_names:
            pf_figure = os.path.join(env["pd-work-results"],
                                     "scatter_{}_{}".format(f1, f2))
            plot_scatter_for_dataframe_columns(
                df, [f1, f2],
                color_by_value=color_by_value,
                figure_options=FigureOptions(title=title, save_fig=pf_figure))
def analyze_gms2_components_on_verified_set(env, gil):
    # type: (Environment, GenomeInfoList) -> None
    """Collect the per-genome component analysis and bar-plot "Error" per genome.

    The combined table is printed as CSV before plotting.

    :param env: environment; ``env["pd-work"]`` is used for the figure name
    :param gil: iterable of genome-info entries, each passed to
        ``analyze_gms2_components_on_verified_set_for_gi``
    """
    # Run the different components for every genome.
    list_df = [
        analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil
    ]

    df = pd.concat(list_df, ignore_index=True, sort=False)
    df["Genome"] = df.apply(fix_names, axis=1)
    print(df.to_csv())

    fig, ax = plt.subplots(figsize=(12, 4))

    # Materialise the reversed order as a list: a bare ``reversed(...)``
    # iterator can be consumed only once, which breaks as soon as the order
    # is read a second time downstream.
    component_order = ["GMS2", "MGM2*", "Start Context", "RBS",
                       "Start Codons", "Promoter", "MGM"][::-1]

    sns.barplot(df, "Genome", "Error", hue="Component",
                ax=ax,
                figure_options=FigureOptions(
                    save_fig=next_name(env["pd-work"])
                ),
                sns_kwargs={
                    "hue_order": component_order,
                    "palette": CM.get_map("gms2_components")
                })
def kimura_dist_plot(env, df):
    """Overlay one "Average-Kimura" density curve per ancestor on a single axes.

    :param env: environment; ``env["pd-work"]`` is used for the figure name
    :param df: data frame with "Ancestor" and "Average-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = list(set(df["Ancestor"]))

    fig, ax = plt.subplots()  # type: plt.Figure, plt.Axes

    for anc in ancestors:
        is_anc = df["Ancestor"] == anc
        seaborn.distplot(df[is_anc]["Average-Kimura"],
                         ax=ax,
                         color=CM.get_map("ancestor")[anc],
                         hist=False,
                         label=anc)

    # Legend entries follow the same order the curves were drawn in.
    ax.legend(ancestors)
    ax.set_ylabel("PDF")

    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    """Compare GMS2, SBSP and NCBI label files; print counts and draw a Venn diagram.

    :param env: environment; ``env["pd-work"]`` gives the default Venn path
    :param pf_gms2: path to the GMS2 labels file
    :param pf_sbsp: path to the SBSP labels file
    :param pf_ncbi: path to the NCBI labels file
    :param kwargs: ``venn_title`` (diagram title) and ``pf_venn`` (output
        path, default "venn.pdf" under ``env["pd-work"]``)
    """
    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn",
                        os.path.join(env["pd-work"], "venn.pdf"))

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    # Intersect GMS2 with SBSP, then that result with NCBI.
    gms2_vs_sbsp = LabelsComparisonDetailed(labels_gms2, labels_sbsp,
                                            name_a="gms2", name_b="sbsp")
    labels_gms2_sbsp_3p_5p = gms2_vs_sbsp.intersection("a")

    pair_vs_ncbi = LabelsComparisonDetailed(labels_gms2_sbsp_3p_5p, labels_ncbi,
                                            name_a="gms2_sbsp", name_b="ncbi")
    labels_gms2_sbsp_ncbi_3p_5p = pair_vs_ncbi.intersection("a")

    # Two-line CSV: header, then the size of each label set.
    counts = (
        len(labels_gms2),
        len(labels_sbsp),
        len(labels_ncbi),
        len(labels_gms2_sbsp_3p_5p),
        len(labels_gms2_sbsp_ncbi_3p_5p),
    )
    out = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    out += "\n{},{},{},{},{}".format(*counts)
    print(out)

    venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi,
                        FigureOptions(title=venn_title, save_fig=pf_venn))
def logo_rbs_from_gms2_mod_file(pd_figures, pf_mod, title=""):
    # type: (str, str, str) -> None
    """Plot the RBS motif logo and its spacer distribution from a GMS2 model file.

    Left panel: information-content logo built from the "RBS_MAT" entry with
    the "NON_MAT" entry as background. Right panel: the spacer values of the
    motif model by position.

    :param pd_figures: passed to ``next_name`` to get the output file name
    :param pf_mod: path to the GMS2 model file
    :param title: title of the logo panel
    """
    mod = GMS2Mod.init_from_file(pf_mod)
    mm = MotifModel(mod.items["RBS_MAT"], mod.items["RBS_POS_DISTR"])
    non = GMS2Noncoding(mod.items["NON_MAT"])

    import matplotlib.pyplot as plt
    fig, axes = plt.subplots(1, 2)
    logo_ax, spacer_ax = axes[0], axes[1]

    import logomaker as lm
    information = lm.transform_matrix(mm.pwm_to_df(),
                                      from_type="probability",
                                      to_type="information",
                                      background=non.pwm_to_array(0))
    lm.Logo(information, ax=logo_ax)
    logo_ax.set_title(title)
    logo_ax.set_ylim(0, 2)

    spacer_table = {
        "Distance from start": range(len(mm._spacer)),
        "Probability": mm._spacer
    }
    df_spacer = pd.DataFrame(spacer_table)

    sns.lineplot(df_spacer, "Distance from start", "Probability",
                 ax=spacer_ax,
                 figure_options=FigureOptions(ylim=[0, 0.4]))

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Load the statistics CSV, extend it, and run the per-genome visualisation.

    :param env: environment forwarded to ``viz_per_genome``
    :param args: parsed command-line arguments; ``args.pf_stats`` is the
        path of the CSV file to read
    """
    df = pd.read_csv(args.pf_stats)

    # The return value is not used; presumably ``compute_more`` extends ``df``
    # in place - TODO confirm.
    compute_more(df)

    # A FigureOptions(ylim=[0, 700000]) object used to be built here but was
    # never passed anywhere; the dead local has been removed.
    viz_per_genome(env, df)
def analyze_by_step_group(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Point-plot 3p and 5p-3p match percentages per step group, one line per genome.

    Each row's "by_step_group_<tag>" cell is expanded into its own data
    frame; the genomes "A. pernix" and "Synechocystis" are left out. The
    combined table is printed at the end.

    :param df: data frame with "Genome" and "by_step_group_<tag>" columns
    :param pd_work: passed to ``next_name`` to get the figure names
    :param fn_prefix: not used by this function
    :param tag: suffix selecting the source column and the plotted columns
    """
    source_column = "by_step_group_{}".format(tag)
    skipped_genomes = {"A. pernix", "Synechocystis"}

    per_genome = []
    for index in df.index:
        curr_df = pd.DataFrame(df.at[index, source_column])
        genome = df.at[index, "Genome"]
        curr_df["Genome"] = genome

        if genome not in skipped_genomes:
            per_genome.append(curr_df)

    df_acc = pd.concat(per_genome)

    options_3p = FigureOptions(
        title="Percentage 3p match versus minimum support",
        ylabel="Percentage of 3p match",
        save_fig=next_name(pd_work),
        ylim=[None, 100.5])
    sns.catplot(df_acc, "Step Group",
                "Percentage 3p match: Verified from {}".format(tag),
                hue="Genome", kind="point",
                figure_options=options_3p)

    options_5p = FigureOptions(
        title="Percentage 5p-3p match versus minimum support",
        ylabel="Percentage of 5p-3p match",
        save_fig=next_name(pd_work),
        ylim=[90, 100.5])
    sns.catplot(df_acc, "Step Group",
                "Percentage 5p-3p match: Verified from {}".format(tag),
                kind="point", hue="Genome",
                figure_options=options_5p)

    print(df_acc.to_string())
def scatter(df, column_x, column_y, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn joint scatter plot of two columns, then save and show it.

    :param df: data frame holding both columns
    :param column_x: column name for the x axis
    :param column_y: column name for the y axis
    :param figure_options: axis properties and, when ``save_fig`` is set,
        the path the figure is written to
    :param kwargs: ``identity`` draws a dashed red identity line via
        ``add_identity``. ``column_z`` is still accepted for compatibility
        but is not used, because the joint plot takes no hue.
    """
    identity = get_value(kwargs, "identity", False)

    # jointplot is figure-level and creates its own figure. Everything below
    # targets the joint axes of that figure, not a separately created empty
    # one. Data is passed by keyword, which works across seaborn versions.
    grid = sns.jointplot(x=df[column_x], y=df[column_y], kind="scatter",
                         alpha=0.3, s=10, linewidth=0)
    ax = grid.ax_joint

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)

    if figure_options is not None and figure_options.save_fig is not None:
        # The keyword is "bbox_inches"; "bbox_index" is not a savefig option.
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
def df_plot_scatter_matrix(env, df, column_names, **kwargs):
    # type: (Environment, pd.DataFrame, Union[List, Set], Dict[str, Any]) -> None
    """Plot a scatter matrix of the given columns into "scatter.pdf".

    :param env: environment; the figure goes under ``env["pd-work-results"]``
    :param df: data frame holding the columns
    :param column_names: columns included in the matrix
    :param kwargs: ``color_by_value`` is passed on as ``color_by``; all
        keyword arguments are also forwarded unchanged to
        ``plot_scatter_matrix``
    """
    color_by_value = get_value(kwargs, "color_by_value", None)

    # NOTE(review): there used to be an if/else on ``color_by_value`` here,
    # but both branches made exactly this call, so a single call is kept.
    pf_figure = os.path.join(env["pd-work-results"], "scatter.pdf")
    plot_scatter_matrix(df, column_names,
                        color_by=color_by_value,
                        figure_options=FigureOptions(save_fig=pf_figure),
                        **kwargs)
def plot_hist_by_group(df_data, column_x, column_group=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Plot the distribution of a column, optionally one density curve per group.

    Without ``column_group`` a plain histogram with black bar edges is drawn.
    With it, one KDE curve per group is drawn (shaded unless cumulative).

    :param df_data: data frame holding the columns
    :param column_x: column whose distribution is plotted
    :param column_group: optional column to group by
    :param figure_options: axis properties and, when ``save_fig`` is set,
        the path the figure is written to
    :param kwargs: ``bins`` (histogram bins, default 10; only used without
        grouping) and ``cumulative`` (draw cumulative densities, default False)
    """
    bins = get_value(kwargs, "bins", 10)
    cumulative = get_value(kwargs, "cumulative", False)

    _, ax = plt.subplots()

    if column_group is not None:
        # Shading is only applied to non-cumulative curves.
        kde_kws = {"shade": not cumulative, "cumulative": cumulative}
        for name, df_group in df_data.groupby(column_group):
            sns.distplot(df_group[column_x], hist=False, kde_kws=kde_kws,
                         label=name)
    else:
        sns.distplot(df_data[column_x], bins=bins, hist=True, kde=False,
                     hist_kws={"edgecolor": "black"})

    FigureOptions.set_properties_for_axis(ax, figure_options)

    if figure_options is not None and figure_options.save_fig is not None:
        # The keyword is "bbox_inches"; "bbox_index" is not a savefig option.
        plt.savefig(figure_options.save_fig, bbox_inches="tight")

    plt.show()
def lmplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn ``lmplot``, save and show it, and return the grid.

    :param df: data frame holding the columns to plot
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colouring
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and to ``save_figure``
    :param kwargs: ``sns_kwargs`` (extra arguments for ``sns.lmplot``;
        ``aspect`` defaults to 2), ``legend``, ``legend_loc``, ``legend_title``
    :return: the grid object returned by ``sns.lmplot``
    """
    # Copy before adding the default so the caller's dict is not modified
    # (callers may reuse one dict across several plots).
    sns_kwargs = dict(get_value(kwargs, "sns_kwargs", dict()))
    sns_kwargs.setdefault("aspect", 2)

    g = sns.lmplot(x=x, y=y, hue=hue, data=df, legend=False, **sns_kwargs)
    main_ax = g.axes[0][0]

    FigureOptions.set_properties_for_axis(main_ax, figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            # Default placement: outside the axes, to the right.
            main_ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5),
                           title=title)
        else:
            main_ax.legend(loc=legend_loc)

    save_figure(figure_options, fig=g.fig)
    plt.subplots_adjust(right=1)
    plt.show()
    return g
def analyze_by_support(df, pd_work, fn_prefix, tag):
    # type: (pd.DataFrame, str, str, str) -> None
    """Line-plot 3p and 5p-3p match percentages against minimum support per genome.

    Each row's "by_support_<tag>" cell is expanded into its own data frame;
    the genomes "A. pernix" and "Synechocystis" are left out.

    :param df: data frame with "Genome" and "by_support_<tag>" columns
    :param pd_work: passed to ``next_name`` to get the figure names
    :param fn_prefix: not used by this function
    :param tag: suffix selecting the source column and the plotted columns
    """
    source_column = "by_support_{}".format(tag)
    skipped_genomes = {"A. pernix", "Synechocystis"}

    per_genome = []
    for index in df.index:
        curr_df = pd.DataFrame(df.at[index, source_column])
        genome = df.at[index, "Genome"]
        curr_df["Genome"] = genome

        if genome not in skipped_genomes:
            per_genome.append(curr_df)

    df_acc = pd.concat(per_genome)

    options_3p = FigureOptions(
        title="Percentage of verified genes predicted\nby {}".format(tag),
        ylabel="Percentage",
        save_fig=next_name(pd_work),
        ylim=[None, 100.5])
    sns.lineplot(df_acc, "Min Support",
                 "Percentage 3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=options_3p)

    options_5p = FigureOptions(
        title="Percentage of predicted {} genes\nwith correct 5' end".format(tag),
        ylabel="Percentage of 5p-3p match",
        save_fig=next_name(pd_work),
        ylim=[90, 100.5])
    sns.lineplot(df_acc, "Min Support",
                 "Percentage 5p-3p match: Verified from {}".format(tag),
                 hue="Genome",
                 figure_options=options_5p)
def lineplot(df, x, y, hue=None, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    """Draw a seaborn line plot on a new or caller-supplied axes.

    :param df: data frame holding the columns to plot
    :param x: column name for the x axis
    :param y: column name for the y axis
    :param hue: optional column name used for colouring
    :param figure_options: handed to ``FigureOptions.set_properties_for_axis``
        and, when the figure is shown, to ``save_figure``
    :param kwargs: optional settings read through ``get_value``:
        ``sns_kwargs`` (extra arguments for ``sns.lineplot``),
        ``ax`` (axes to draw on; a new figure is created when missing),
        ``show`` (save and show the figure; defaults to True only when no
        ``ax`` was given),
        ``legend``, ``legend_loc``, ``legend_ncol``, ``legend_title``,
        ``identity`` (draw a dashed red identity line via ``add_identity``)
    """
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)
    # When the caller owns the axes, leave saving/showing to the caller.
    show = get_value(kwargs, "show", ax is None)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    legend_ncol = get_value(kwargs, "legend_ncol", 1)
    identity = get_value(kwargs, "identity", False)

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = ax.get_figure()

    sns.lineplot(x=x, y=y, hue=hue, data=df, ax=ax, legend=legend,
                 **sns_kwargs)

    if identity:
        add_identity(ax, color="r", ls="--")

    FigureOptions.set_properties_for_axis(ax, figure_options)

    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        # Use ax.legend rather than plt.legend: the latter targets pyplot's
        # current axes, which need not be the axes that was plotted on.
        if not legend_loc:
            ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5),
                      title=title, ncol=legend_ncol)
        else:
            ax.legend(loc=legend_loc, ncol=legend_ncol, title=title)

        # An empty title asks for a legend without its first entry; the
        # legend is rebuilt from the remaining handles.
        if title is not None and len(title) == 0:
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[1:], labels=labels[1:],
                      ncol=legend_ncol)

    if show:
        save_figure(figure_options, fig)
        plt.show()
def _histogram_multiple_stats_summary_by_attribute(self, list_df, pd_output):
    # type: (List[Tuple[str, pd.DataFrame]], str) -> None
    """Merge the per-step data frames and bar-plot "% Common 5'" per step.

    :param list_df: ``(step value, data frame)`` pairs; each frame receives
        a "step" column holding its value (the frame is modified in place)
    :param pd_output: directory in which "histogram.pdf" is written
    """
    # Tag every frame with its step value.
    frames = []
    for value, curr_df in list_df:
        curr_df["step"] = value
        frames.append(curr_df)

    # One concat instead of repeated DataFrame.append: append copies the
    # accumulated frame on every call and was removed in pandas 2.0.
    # pd.concat rejects an empty list, so that case keeps the empty frame.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()

    plot_catplot(
        df, "step", "% Common 5'",
        FigureOptions(save_fig=os.path.join(pd_output, "histogram.pdf")))
def contour_kimura_per_ancestor(env, df):
    """Draw one Min-Kimura vs Max-Kimura density panel per ancestor.

    The panels sit in a two-row grid with shared axes; a frameless axes
    over the whole figure carries the common x and y labels.

    :param env: environment; ``env["pd-work"]`` is used for the figure name
    :param df: data frame with "Ancestor", "Min-Kimura" and "Max-Kimura" columns
    """
    import seaborn
    import matplotlib.pyplot as plt

    ancestors = sorted(list(set(df["Ancestor"])))
    num_columns = math.ceil(len(ancestors) / 2)

    fig, axes = plt.subplots(2, num_columns, sharex=True, sharey=True,
                             figsize=(6, 6))

    for anc, ax in zip(ancestors, axes.ravel()):
        df_group = df[df["Ancestor"] == anc]
        minimums = df_group["Min-Kimura"].values
        maximums = df_group["Max-Kimura"].values
        seaborn.kdeplot(minimums, maximums, ax=ax)
        ax.set_title(anc)

    # Big invisible axes used only to hold the shared axis labels.
    fig.add_subplot(111, frameon=False)

    # Hide tick and tick label of the big axes.
    hidden = dict(top=False, bottom=False, left=False, right=False,
                  labelbottom=False, labeltop=False, labelleft=False,
                  labelright=False)
    plt.tick_params(which="both", **hidden)

    plt.xlabel("Minimum Kimura", labelpad=20)
    plt.ylabel("Maximum Kimura", labelpad=30)

    fig.tight_layout()
    save_figure(FigureOptions(save_fig=next_name(env["pd-work"])))
    plt.show()
def viz_summary_per_gcfid_per_step(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Box-plot the "(GMS2=SBSP)!=NCBI % GMS2=SBSP" summary per ancestor and step.

    A summary from ``get_summary_per_gcfid`` is computed separately for the
    rows predicted at step "A", "B" and "C", then the three are combined.

    :param env: environment; ``env['pd-work']`` is used for the figure name
    :param df: data frame with a "Predicted-at-step" column
    """
    pd_work = env['pd-work']

    def summary_for_step(step):
        # Summary restricted to one step, tagged with that step.
        at_step = df["Predicted-at-step"] == step
        summary = get_summary_per_gcfid(df[at_step])
        summary["SBSP Step"] = step
        return summary

    summaries = [summary_for_step(step) for step in ["A", "B", "C"]]
    df_per_gcfid_per_step = pd.concat(summaries, sort=False)

    plot_options = FigureOptions(save_fig=next_name(pd_work),
                                 xlabel="Clade",
                                 ylabel="Err(NCBI,GMS2=SBSP)")
    sns.catplot(df_per_gcfid_per_step, "Ancestor",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="SBSP Step",
                kind="box",
                legend_loc="best",
                figure_options=plot_options)
def plot_histograms_for_columns(env, df_data, column_names, **kwargs):
    # type: (Environment, pd.DataFrame, List[str], Dict[str, Any]) -> None
    """Plot one histogram per column through ``plot_hist_by_group``.

    Each figure is saved as "hist<column>.pdf" under ``env["pd-work"]``,
    with spaces in the column name turned into underscores and parentheses
    removed.

    :param env: environment providing ``env["pd-work"]``
    :param df_data: data frame holding the columns
    :param column_names: columns to plot
    :param kwargs: ``group_by`` (optional grouping column), ``title_name``
        (figure title), ``xlim`` (x-axis limits), ``bins`` (default 10)
    """
    group_by = get_value(kwargs, "group_by", None)
    title_name = get_value(kwargs, "title_name", "", default_if_none=True)
    xlim = get_value(kwargs, "xlim", None)

    for c in column_names:
        # File-name-friendly version of the column name.
        safe_name = c
        for old, new in ((" ", "_"), ("(", ""), (")", "")):
            safe_name = safe_name.replace(old, new)

        options = FigureOptions(
            xlabel=c,
            title="{}".format(title_name),
            ylabel="Frequency",
            save_fig=os.path.join(env["pd-work"],
                                  "hist{}.pdf".format(safe_name)),
            xlim=xlim)

        plot_hist_by_group(df_data, c, group_by,
                           figure_options=options,
                           bins=get_value(kwargs, "bins", 10))
def df_plot_scatter_separate(env, df, column_pairs, **kwargs):
    # type: (Environment, pd.DataFrame, List[List], Dict[str, Any]) -> None
    """Write one scatter figure per column pair.

    Figures are saved as "scatter_<x>_<y>.pdf" under
    ``env["pd-work-results"]``.

    :param env: environment providing ``env["pd-work-results"]``
    :param df: data frame holding the columns
    :param column_pairs: ``[x column, y column]`` pairs to plot
    :param kwargs: ``color_by_value``, ``limit_x_axis_features`` (keep only
        pairs whose x column is listed), ``title``; ``jitter`` is read but
        not used
    """
    color_by_value = get_value(kwargs, "color_by_value", None)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    jitter = get_value(kwargs, "jitter", None)
    title = get_value(kwargs, "title", None)

    if limit_x_axis_features is not None:
        column_pairs = [
            pair for pair in column_pairs
            if pair[0] in limit_x_axis_features
        ]

    for f1, f2 in column_pairs:
        pf_figure = os.path.join(env["pd-work-results"],
                                 "scatter_{}_{}.pdf".format(f1, f2))
        options = FigureOptions(save_fig=pf_figure,
                                xlabel=f1,
                                ylabel=f2,
                                title=title)
        plot_scatter_for_dataframe_columns(df, [f1, f2],
                                           color_by_value=color_by_value,
                                           figure_options=options)
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Analyze and plot upstream-gene distances.

    Figures are named via ``next_name`` inside an "upstream_distances"
    directory under ``env["pd-work"]``. The function:

    1. parses the "Upstream-distance" strings and derives the
       "Most frequent upstream" and "PC(x,0)" / "PC(x,3)" columns;
    2. plots PC against the most frequent upstream distance;
    3. plots the distribution of the most frequent upstream distance,
       overall and per ancestor (as percentages and as counts);
    4. plots "(GMS2=SBSP)!=NCBI % GMS2=SBSP" and "GMS2=SBSP" per distance
       range and ancestor.

    NOTE(review): this file was whitespace-collapsed, so loop membership of
    some statements was reconstructed from context - confirm against the
    original layout.

    :param env: environment providing ``env["pd-work"]``
    :param df: data frame with at least "Upstream-distance", "Support",
        "GMS2=SBSP", "NCBI", "(GMS2=SBSP)!=NCBI" and "Ancestor" columns
    """
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists (stored as the literal string "[]")
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility), axis=1)

    df = df[df["Support"] > 10].copy()

    # for mf in range(-20, 50):
    #     df_mf = df[df["Most frequent upstream"] == mf]
    #     if len(df_mf) < 50:
    #         continue
    #
    #     sns.distplot(df_mf, "PC(x,0)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 0),
    #         save_fig=next_name(pd_work),
    #         xlim=(0,1)
    #     ))
    #     sns.distplot(df_mf, "PC(x,3)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 3),
    #         save_fig=next_name(pd_work),
    #         xlim=(0, 1)
    #     ))

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    # df_tmp keeps every row in the distance window; df (below) additionally
    # requires the "GMS2=SBSP" flag.
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) & (df["Most frequent upstream"] > -50)]
    # NCBI consistency as a func
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) & (df["Most frequent upstream"] < 100) & (df["Most frequent upstream"] > -50)]

    # Stack the two PC columns into one "PC(x,f)" column labelled by
    # "Flexibility" so both can be drawn as hues of a single plot.
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)", "Ancestor"]],
        ["PC(x,0)", "PC(x,3)"], "PC(x,f)", None, label_col="Flexibility")

    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #             hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility")
    # plt.show()

    sns.lmplot(df_tmp, "Most frequent upstream", "PC(x,f)", hue="Flexibility", sns_kwargs={
        "scatter": False, "lowess": True
    }, figure_options=FigureOptions(save_fig=next_name(pd_work), xlim=[-7, None], ylim=[0, 1]))

    sns.distplot(df, "Most frequent upstream", figure_options=FigureOptions(save_fig=next_name(pd_work)), sns_kwargs={"kde": True})

    import seaborn
    # seaborn.countplot("Most frequent upstream", data=df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)], hue="Ancestor")

    # Share of each distance value (between -10 and 10, exclusive) within
    # every ancestor, in percent, drawn as a point plot.
    (df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # Same plot with raw counts instead of percentages.
    (df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # Histogram (left y axis) and KDE (twin right y axis) per ancestor on
    # one figure. This figure is shown but not saved.
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)
        # ax2.set_ylim(0, 3)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    # g = seaborn.FacetGrid(df, hue="Ancestor")
    # g = g.map(seaborn.distplot, "Most frequent upstream", hist=True)
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity
    # collect:
    # average 5' per ancestor, r,

    # Half-open distance ranges [low, high).
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:
        r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        # viz_summary_per_gcfid(env, df_summary_per_gcfid, title=str(r))

        # Average the per-GCFID summaries within each ancestor.
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()

    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # Percentage of rows flagged "(GMS2=SBSP)!=NCBI" among those with
            # both the "GMS2=SBSP" and "NCBI" flags set.
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        # Tick position (range midpoint) and tick label for each range.
        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # One row of axes per ancestor; each row shows the percentage (left
    # axis) and the "GMS2=SBSP" count (twin right axis, red).
    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg",
                         "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot("range_avg",
                         "GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    # All ancestors on one pair of twin axes.
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg",
                     "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot("range_avg",
                     "GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    # plt.xticks(range_avgs, range_label)
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")

    # Colour each y label like the axis it belongs to.
    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    # Overrides the "Range Average" x label set a few lines above.
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()

    # sbsp_geom_density(df, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work)
    #
    # for ancestor, df_group in df.groupby("Ancestor", as_index=False):
    #     sbsp_geom_density(df_group, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work, ancestor)
    #     sbsp_geom_density(df_group, "Support", "GMS2=SBSP=NCBI", pd_work, ancestor)

    # NOTE(review): no-op assignment; looks like a leftover debugger anchor.
    a = 0
def compare_distance_local_vs_global(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    """Scatter-plot local against global alignment measures.

    Rows are first restricted to a global distance strictly between 0.001
    and 0.5 and a global ungapped length below 1100. Five figures are
    written to ``env["pd-work"]``: distance, alignment length and ungapped
    alignment length (local vs global, each with an identity line), then
    the distance difference against the two length ratios.

    :param env: environment providing ``env["pd-work"]``
    :param df: data frame with the global_* and local_* columns
    :param kwargs: ``extension`` (figure file extension, default "png") and
        ``fn_prefix`` (file-name prefix, default empty)
    """
    pd_work = env["pd-work"]
    ext = get_value(kwargs, "extension", "png")
    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)

    def figure_path(template):
        # Fill in prefix and extension and place the file in the work dir.
        return os.path.join(pd_work, template.format(fn_prefix, ext))

    keep = ((df["global_distance"] < 0.5)
            & (df["global_distance"] > 0.001)
            & (df["global_length_without_gaps"] < 1100))
    df = df[keep].copy()

    pf_distance = figure_path("{}distance_local_vs_global.{}")
    pf_alignment_length = figure_path(
        "{}alignment_length_local_vs_global.{}")
    pf_ungapped_alignment_length = figure_path(
        "{}ungapped_alignment_length_local_vs_global.{}")
    pf_diff_distance_vs_ratio_length = figure_path(
        "{}diff_distance_vs_ratio_length.{}")
    pf_diff_distance_vs_ratio_ungapped_length = figure_path(
        "{}diff_distance_vs_ratio_ungapped_length.{}")

    # compare kimura local vs global
    distance_options = FigureOptions(
        title="Distance by local vs global alignment",
        xlabel="Global",
        ylabel="Local",
        xlim=[0, 0.8],
        ylim=[0, 0.8],
        save_fig=pf_distance,
        balanced=True)
    scatter(df, "global_distance", "local_distance",
            figure_options=distance_options,
            identity=True)

    # compare alignment length of local vs global
    length_options = FigureOptions(
        title="Alignment length of local vs global",
        xlabel="Global",
        ylabel="Local",
        save_fig=pf_alignment_length,
        balanced=True)
    scatter(df, "global_length", "local_length",
            figure_options=length_options,
            identity=True)

    # compare ungapped alignment length of local vs global
    ungapped_options = FigureOptions(
        title="Ungapped alignment length of local vs global",
        xlabel="Global",
        ylabel="Local",
        save_fig=pf_ungapped_alignment_length,
        balanced=True)
    scatter(df, "global_length_without_gaps", "local_length_without_gaps",
            figure_options=ungapped_options,
            identity=True)

    # compare difference in alignment length versus difference in local/global
    df["diff_distance"] = df["global_distance"] - df["local_distance"]
    df["ratio_ungapped_length"] = (df["local_length_without_gaps"]
                                   / df["global_length_without_gaps"])
    df["ratio_length"] = df["local_length"] / df["global_length"]

    ratio_options = FigureOptions(
        title="Difference in distance vs ratio of alignment lengths",
        xlabel="Ratio of lengths",
        ylabel="Difference in distance",
        save_fig=pf_diff_distance_vs_ratio_length,
    )
    scatter(df, "ratio_length", "diff_distance",
            figure_options=ratio_options)

    ungapped_ratio_options = FigureOptions(
        title="Difference in distance vs ratio of ungapped alignment lengths",
        xlabel="Ratio of ungapped lengths",
        ylabel="Difference in distance",
        save_fig=pf_diff_distance_vs_ratio_ungapped_length,
    )
    scatter(df, "ratio_ungapped_length", "diff_distance",
            figure_options=ungapped_ratio_options)
def plot_per_tool_by_genome_type(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Draw one panel per 5' tag of column ``M:<tag>`` against ``GC``.

    Tags come from ``get_tags_for_5prime(df)``. Panels are laid out on a
    two-row grid with shared x and y axes; each panel is filled by
    ``loess_with_stde``. Only the first panel is asked to draw a colorbar,
    which goes into a dedicated axes on the right edge of the figure. The
    figure is saved to ``next_name(env["pd-work"])`` and then shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read here.
    :param df: frame holding a ``GC`` column and one ``M:<tag>`` column per
        tag returned by ``get_tags_for_5prime``.
    """
    list_tags = get_tags_for_5prime(df)
    num_tags = len(list_tags)
    if num_tags == 0:
        # Nothing to draw; a grid with zero columns is not a valid layout.
        return

    fig, ax = plt.subplots(2, math.ceil(num_tags / 2),
                           sharey="all", sharex="all")

    # Narrow axes on the right edge reserved for the shared colorbar.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    kws = {
        "xlim": [0.2, 0.8],
        "ylim": [0, 35],
        "cbar_max": 1,
        "num_steps": 35,
    }

    # Pair every tag with its own axes. The grid holds at least ``num_tags``
    # axes, so zipping against it never drops a tag (an extra fixed-length
    # iterable in the zip would silently cap the number of panels).
    for index, (tag, a) in enumerate(zip(list_tags, ax.ravel())):
        # Only the first panel draws the colorbar into ``cbar_ax``.
        cbar_kws = {"cbar_ax": cbar_ax, "cbar": True} if index == 0 else {}

        loess_with_stde(df, "GC", f"M:{tag}", a, tag.replace("=", ","),
                        **kws, **cbar_kws)

        a.set_title(
            tag.replace("=", ",").replace("NCBI", "PGAP").replace(
                "GMS2", "GeneMarkS-2"))
        # Per-panel labels are blanked; shared labels are added below.
        a.set_ylabel("")
        a.set_xlabel("")

    figure_options = FigureOptions(save_fig=next_name(env["pd-work"]))

    # Frameless axes spanning the whole figure, used only to carry the
    # common x/y labels; its ticks and tick labels are hidden.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(top=False, bottom=False, left=False, right=False,
                    which="both", labelbottom=False, labeltop=False,
                    labelleft=False, labelright=False)
    plt.xlabel("GC", labelpad=30)
    plt.ylabel("Percentage of gene-start differences", labelpad=30)

    # Leave the right-hand strip of the figure free for the colorbar axes.
    fig.tight_layout(rect=[-0.02, -0.02, .9, 1])

    save_figure(figure_options, fig)
    plt.show()
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis and render its summary figures.

    The genome list is loaded from ``args.pf_genome_list``. The analysis
    runs in-process, or through ``PBS`` when ``prl_options["use-pbs"]`` is
    truthy, in which case the per-job frames are concatenated. The resulting
    frame is written to ``<pd-work>/summary.csv`` and a fixed series of
    plots is saved under ``<pd-work>/summary_figures``; every plot asks
    ``next_name`` for its own output file.

    :param env: environment mapping; ``env["pd-work"]`` is the output root.
    :param args: parsed command-line arguments; ``pf_genome_list`` is read
        directly and ``vars(args)`` seeds the parallelization options.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if prl_options["use-pbs"]:
        pbs = PBS(env, prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        job_kwargs = {"env": env, "prl_options": prl_options}
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs=job_kwargs)
        df = pd.concat(list_df, ignore_index=True, sort=False)
    else:
        df = relative_entropy_analysis(env, gil, prl_options)

    pf_summary = os_join(env["pd-work"], "summary.csv")
    df.to_csv(pf_summary, index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    def _next_figure(**options):
        # type: (Any) -> FigureOptions
        # Fresh options object that saves to the next file in pd_figures.
        return FigureOptions(save_fig=next_name(pd_figures), **options)

    sns.scatterplot(df, "Percent", "Error",
                    figure_options=_next_figure(ylim=[0, 20]))

    # Error as a function of each relative-entropy measure, per genome.
    for column in ("RE", "RE Motif", "RE Spacer"):
        sns.lineplot(df, column, "Error", hue="Genome",
                     figure_options=_next_figure(ylim=[0, 20]))

    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome",
                    identity=True,
                    figure_options=_next_figure())

    # lmplot of Error against each predictor, per genome.
    for column in ("Percent", "RE", "RE Motif", "RE Spacer"):
        sns.lmplot(df, column, "Error", hue="Genome",
                   figure_options=_next_figure(ylim=[0, 20]))

    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=_next_figure())