def n50_barplot(df, figformat, path, title=None, palette=None):
    """Create a seaborn bar plot of the read length N50 of every dataset.

    The N50 is computed by get_N50 on the sorted "aligned_lengths" column when
    the dataframe has one, and on the sorted "lengths" column otherwise.

    :param df: dataframe with a "dataset" column and a lengths column
    :param figformat: file extension of the figure, also passed to Plot.save
    :param path: prefix of the output file
    :param title: optional title replacing the default one
    :param palette: optional palette forwarded to seaborn
    :return: list containing the single Plot object
    """
    n50_bar = Plot(path=path + "NanoComp_N50." + figformat,
                   title="Comparing read length N50")
    datasets = df["dataset"].unique()
    if "aligned_lengths" in df:
        column = "aligned_lengths"
        # The bars are N50 values, not a throughput, so say so on the axis.
        ylabel = 'Aligned read length N50'
    else:
        column = "lengths"
        ylabel = 'Sequenced read length N50'
    n50s = [get_N50(np.sort(df.loc[df["dataset"] == d, column]))
            for d in datasets]
    ax = sns.barplot(x=list(datasets), y=n50s, palette=palette, order=datasets)
    ax.set(ylabel=ylabel, title=title or n50_bar.title)
    plt.xticks(rotation=30, ha='center')
    n50_bar.fig = ax.get_figure()
    n50_bar.save(format=figformat)
    plt.close("all")
    return [n50_bar]
def sequencing_speed_over_time(dfs, path, title, settings, color="#4CB391"):
    """Draw a plotly violin plot of sequencing speed per time bin.

    Speed is the "lengths" column divided by the "duration" column; rows
    whose duration equals zero are left out of the plot.

    :return: the Plot object holding the figure and its html
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot.html",
                         title="Violin plot of sequencing speed over time")

    has_duration = dfs['duration'] != 0
    bins = dfs.loc[has_duration, "timebin"]
    speed = dfs.loc[has_duration, "lengths"] / dfs.loc[has_duration, "duration"]

    violin = go.Violin(x=bins,
                       y=speed,
                       points=False,
                       spanmode="hard",
                       line_color='black',
                       line_width=1.5,
                       fillcolor=color,
                       opacity=0.8)
    fig = go.Figure()
    fig.add_trace(violin)
    fig.update_layout(xaxis_title='Interval (hours)',
                      yaxis_title='Sequencing speed (nucleotides/second)',
                      title=title or time_duration.title,
                      title_x=0.5)
    fig.update_xaxes(tickangle=45)

    time_duration.fig = fig
    time_duration.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
    time_duration.save(settings)
    return time_duration
def cumulative_yield(dfs, path, figformat, title, color):
    """Create two seaborn scatter plots of cumulative yield against run time.

    The first plot shows the cumulative sum of "lengths" in gigabases
    (maximum per 1 minute bin), the second the cumulative number of reads
    (counted per 10 minute bin).

    :return: list with the gigabase plot followed by the read count plot
    """

    def draw_and_save(plot, series, ylabel):
        # The index is converted from seconds to hours for the x axis.
        hours = series.index.total_seconds() / 3600
        axes = sns.regplot(x=hours,
                           y=series,
                           x_ci=None,
                           fit_reg=False,
                           color=color,
                           scatter_kws={"s": 3})
        axes.set(xlabel='Run time (hours)',
                 ylabel=ylabel,
                 title=title or plot.title)
        plot.fig = axes.get_figure()
        plot.save(format=figformat)
        plt.close("all")

    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases." + figformat,
                        title="Cumulative yield")
    gigabases = dfs.loc[:, "lengths"].cumsum().resample('1T').max() / 1e9
    draw_and_save(cum_yield_gb, gigabases, 'Cumulative yield in gigabase')

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads." + figformat,
                           title="Cumulative yield")
    read_counts = dfs.loc[:, "lengths"].resample('10T').count().cumsum()
    draw_and_save(cum_yield_reads, read_counts, 'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
def length_over_time(dfs, path, figformat, title, log_length=False, plot_settings=None):
    """Create a seaborn violin plot of read lengths per time bin.

    :param dfs: dataframe with "timebin" and "lengths" columns, plus
                "log_lengths" when log_length is set
    :param path: prefix of the output file
    :param figformat: file extension of the figure, also passed to Plot.save
    :param title: optional title replacing the default one
    :param log_length: plot the "log_lengths" column with power-of-ten ticks
    :param plot_settings: optional dict of extra keyword arguments for sns.set
    :return: the Plot object
    """
    time_length = Plot(path=path + "TimeLengthViolinPlot." + figformat,
                       title="Violin plot of read lengths over time")
    # None instead of a shared mutable {} default; an empty dict adds nothing.
    sns.set(style="white", **(plot_settings or {}))
    if log_length:
        length_column = "log_lengths"
    else:
        length_column = "lengths"

    if "length_filter" in dfs:  # produced by NanoPlot filtering of too long reads
        temp_dfs = dfs[dfs["length_filter"]]
    else:
        temp_dfs = dfs

    ax = sns.violinplot(x="timebin",
                        y=length_column,
                        data=temp_dfs,
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Read length",
           title=title or time_length.title)
    if log_length:
        # Powers of ten up to ten times the longest read, placed on the log10 axis.
        ticks = [10**i for i in range(10) if not 10**i > 10 * np.amax(dfs["lengths"])]
        ax.set(yticks=np.log10(ticks), yticklabels=ticks)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_length.fig = ax.get_figure()
    time_length.save(format=figformat)
    plt.close("all")
    return time_length
def overlay_histogram_phred(df, path, settings, palette=None):
    """Create an overlay histogram of phred-scaled percent identity.

    Reads with a perfect alignment, and thus a percentIdentity of 100, get a
    phred score of Inf. These are set to 60, a very high phred score.

    A "phredIdentity" column is added to df in place.

    :param df: dataframe with a "percentIdentity" column
    :param path: prefix of the output file
    :param settings: passed on to Plot.save
    :param palette: optional list of colors, defaults to the plotly colors
                    repeated five times
    :return: the Plot object
    """
    df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # A single .loc assignment; chained df[col][mask] = ... may write to a
    # temporary copy and leave the infinities in place.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")
    hist_phred.html, hist_phred.fig = plot_overlay_histogram(df,
                                                             palette,
                                                             "phredIdentity",
                                                             hist_phred.title,
                                                             bins=20,
                                                             density=True)
    hist_phred.save(settings)
    return hist_phred
def spatial_heatmap(array, path, colormap, figformat, title=None):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in array and writes that count into
    the layout template at the positions where the layout structure equals
    the value, then shows the template as a plotly heatmap.

    :param array: channel identifiers, one entry per read
    :param path: output path without extension (".html" is appended)
    :param colormap: plotly colorscale for the heatmap
    :param figformat: passed on to Plot.save
    :param title: optional title replacing the default one
    :return: list containing the single Plot object
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + ".html",
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads
    data = pd.DataFrame(layout.template,
                        index=layout.yticks,
                        columns=layout.xticks)
    fig = go.Figure(
        data=go.Heatmap(z=data.values.tolist(), colorscale=colormap))
    fig.update_layout(xaxis_title='Channel',
                      yaxis_title='Number of reads',
                      title=title or activity_map.title,
                      title_x=0.5)
    activity_map.fig = fig
    activity_map.html = activity_map.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    activity_map.save(figformat)
    return [activity_map]
def output_barplot(df, figformat, path, title=None, palette=None):
    """Draw seaborn bar plots of the read count and the throughput per dataset.

    Throughput is the sum of "aligned_lengths" when that column exists and
    the sum of "lengths" otherwise, shown in gigabases.

    :return: tuple of the read count Plot and the throughput Plot
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput."
    )

    # First figure: number of reads per dataset.
    read_count = Plot(path=path + "NanoComp_number_of_reads." + figformat,
                      title="Comparing number of reads")
    count_axes = sns.countplot(x="dataset", data=df, palette=palette)
    count_axes.set(ylabel='Number of reads', title=title or read_count.title)
    plt.xticks(rotation=30, ha='center')
    read_count.fig = count_axes.get_figure()
    read_count.save(format=figformat)
    plt.close("all")

    # Second figure: summed bases per dataset.
    throughput_bases = Plot(path=path + "NanoComp_total_throughput." + figformat,
                            title="Comparing throughput in gigabases")
    aligned = "aligned_lengths" in df
    length_column = 'aligned_lengths' if aligned else 'lengths'
    ylabel = 'Total gigabase aligned' if aligned else 'Total gigabase sequenced'
    throughput = df.groupby('dataset')[length_column].sum()
    bar_axes = sns.barplot(x=list(throughput.index),
                           y=throughput / 1e9,
                           palette=palette,
                           order=df["dataset"].unique())
    bar_axes.set(ylabel=ylabel, title=title or throughput_bases.title)
    plt.xticks(rotation=30, ha='center')
    throughput_bases.fig = bar_axes.get_figure()
    throughput_bases.save(format=figformat)
    plt.close("all")

    return read_count, throughput_bases
def quality_over_time(dfs, path, settings, title=None, color="#4CB391"):
    """Draw a plotly violin plot of the "quals" column per "timebin".

    :return: the Plot object holding the figure and its html
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot.html",
                     title="Violin plot of quality over time")

    violin = go.Violin(y=dfs["quals"],
                       x=dfs["timebin"],
                       points=False,
                       spanmode="hard",
                       line_color='black',
                       line_width=1.5,
                       fillcolor=color,
                       opacity=0.8)
    fig = go.Figure()
    fig.add_trace(violin)
    fig.update_layout(xaxis_title='Interval (hours)',
                      yaxis_title='Basecall quality',
                      title=title or time_qual.title,
                      title_x=0.5)
    fig.update_xaxes(tickangle=45)

    time_qual.fig = fig
    time_qual.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
    time_qual.save(settings)
    return time_qual
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots.

    Counts how often each value occurs in array and writes that count into
    the layout template at the positions where the layout structure equals
    the value, then draws the template as a seaborn heatmap.

    :param array: channel identifiers, one entry per read
    :param path: output path without extension ("." + figformat is appended)
    :param title: optional title replacing the default one
    :param color: colormap for the heatmap
    :param figformat: file extension of the figure, also passed to Plot.save
    :return: list containing the single Plot object
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + "." + figformat,
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()
    for channel, n_reads in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = n_reads
    plt.figure()
    ax = sns.heatmap(data=pd.DataFrame(layout.template,
                                       index=layout.yticks,
                                       columns=layout.xticks),
                     xticklabels="auto",
                     yticklabels="auto",
                     square=True,
                     cbar_kws={"orientation": "horizontal"},
                     cmap=color,
                     linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
def yield_by_minimal_length_plot(array, name, path, settings, title=None, color="#4CB391"):
    """Create a plotly scatter plot of cumulative yield by minimal read length.

    Lengths are sorted from long to short and summed cumulatively (in
    gigabases); at most 10000 randomly chosen points are drawn.

    :param array: read lengths
    :param name: not used by this function
    :param path: prefix of the output file
    :param settings: passed on to Plot.save
    :param title: optional title replacing the default one
    :param color: marker color
    :return: the Plot object
    """
    df = pd.DataFrame(data={"lengths": np.sort(array)[::-1]})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9
    # Sample labels from df itself: df carries a fresh 0..n-1 index, which does
    # not have to match the index of the incoming array.
    idx = np.random.choice(df.index, min(10000, len(df)), replace=False)
    sampled = df.loc[idx]

    yield_by_length = Plot(path=path + "Yield_By_Length.html",
                           title="Yield by length")
    fig = px.scatter(sampled, x="lengths", y="cumyield_gb")
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Read length',
                      yaxis_title='Cumulative yield for minimal length [Gb]',
                      title=title or yield_by_length.title,
                      title_x=0.5)
    yield_by_length.fig = fig
    yield_by_length.html = yield_by_length.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    yield_by_length.save(settings)
    return yield_by_length
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Create a plotly line plot comparing cumulative yield of the datasets.

    For every dataset the cumulative sum of "lengths" (in gigabases, maximum
    per 10 minute bin) is drawn against run time, and the final value is
    annotated at the right edge of the plot.

    :param df: dataframe with "dataset", "start_time" and "lengths" columns
    :param path: prefix of the output file
    :param palette: optional list of colors, defaults to the plotly colors
                    repeated five times
    :param title: optional title replacing the default one
    :return: list containing the single Plot object
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "NanoComp: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    annotations = []
    for sample, color in zip(df["dataset"].unique(), palette):
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10T').max() / 1e9
        # Positional access must go through .iloc: the index holds time
        # offsets, so a plain [-1] relies on a deprecated integer fallback.
        final_yield = cumsum.iloc[-1]
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))
        annotations.append(
            dict(xref='paper',
                 x=0.99,
                 y=final_yield,
                 xanchor='left',
                 yanchor='middle',
                 text='{}Gb'.format(round(final_yield)),
                 showarrow=False))

    # One figure description serves both the html div and the Figure object.
    figure_spec = {
        "data": data,
        "layout": go.Layout(barmode='overlay',
                            title=title or cum_yield_gb.title,
                            xaxis=dict(title="Time (hours)"),
                            yaxis=dict(title="Yield (gigabase)"),
                            annotations=annotations)
    }
    cum_yield_gb.html = plotly.offline.plot(figure_spec,
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure(figure_spec)
    cum_yield_gb.save()
    return [cum_yield_gb]
def output_barplot(df, path, settings, title=None):
    """Create barplots based on number of reads and total sum of nucleotides sequenced.

    Both plots list the datasets in the sorted order of their per-dataset
    result, and every bar is labelled with the key of its own value.

    :param df: dataframe with a "dataset" column and a "lengths" column,
               optionally an "aligned_lengths" column which then takes
               precedence for the throughput
    :param path: prefix of the output files
    :param settings: passed on to Plot.save
    :param title: optional title replacing the default ones
    :return: tuple of the read count Plot and the throughput Plot
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput."
    )
    read_count = Plot(path=path + "NanoComp_number_of_reads.html",
                      title="Comparing number of reads")

    read_count.fig = go.Figure()
    counts = df['dataset'].value_counts(sort=False).sort_index()
    for dataset, count in counts.items():
        read_count.fig.add_trace(go.Bar(x=[dataset], y=[count], name=dataset))

    read_count.fig.update_layout(
        title_text=title or read_count.title,
        title_x=0.5,
        yaxis_title="Number of reads",
    )

    read_count.html = read_count.fig.to_html(full_html=False,
                                             include_plotlyjs='cdn')
    read_count.save(settings)

    throughput_bases = Plot(path=path + "NanoComp_total_throughput.html",
                            title="Comparing throughput in bases")
    if "aligned_lengths" in df:
        throughput = df.groupby('dataset')['aligned_lengths'].sum()
        ylabel = 'Total bases aligned'
    else:
        throughput = df.groupby('dataset')['lengths'].sum()
        ylabel = 'Total bases sequenced'

    throughput_bases.fig = go.Figure()
    # Take the label from the groupby result itself: its order differs from
    # the order of df["dataset"].unique(), so pairing those two would put
    # sums under the wrong dataset names.
    for dataset, sum_dataset in throughput.items():
        throughput_bases.fig.add_trace(
            go.Bar(x=[dataset], y=[sum_dataset], name=dataset))

    throughput_bases.fig.update_layout(
        title=title or throughput_bases.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    throughput_bases.html = throughput_bases.fig.to_html(
        full_html=False, include_plotlyjs='cdn')
    throughput_bases.save(settings)

    return read_count, throughput_bases
def dynamic_histogram(array, name, path, title=None, color="#4CB391"):
    """Build a plotly histogram of at most 10000 sampled values of array.

    The html and figure returned by plotly_histogram are stored on the Plot
    object, which is saved and returned.
    """
    file_name = "Dynamic_Histogram_{}.html".format(name.replace(' ', '_'))
    fallback_title = "Dynamic histogram of {}".format(name)
    dynhist = Plot(path=path + file_name, title=title or fallback_title)

    sample_size = min(len(array), 10000)
    subsample = array.sample(sample_size)
    dynhist.html, dynhist.fig = plotly_histogram(array=subsample,
                                                 color=color,
                                                 title=dynhist.title)
    dynhist.save()
    return dynhist
def length_over_time(dfs, path, title, settings, log_length=False, color="#4CB391"):
    """Create a plotly violin plot of read lengths per time bin.

    :param dfs: dataframe with "timebin" and "lengths" columns, plus
                "log_lengths" when log_length is set
    :param path: prefix of the output file
    :param title: optional title replacing the default one
    :param settings: passed on to Plot.save
    :param log_length: plot the "log_lengths" column with power-of-ten ticks
    :param color: fill color of the violins
    :return: the Plot object
    """
    if log_length:
        time_length = Plot(path=path + "TimeLogLengthViolinPlot.html",
                           title="Violin plot of log read lengths over time")
    else:
        time_length = Plot(path=path + "TimeLengthViolinPlot.html",
                           title="Violin plot of read lengths over time")

    length_column = "log_lengths" if log_length else "lengths"

    if "length_filter" in dfs:  # produced by NanoPlot filtering of too long reads
        temp_dfs = dfs[dfs["length_filter"]]
    else:
        temp_dfs = dfs

    fig = go.Figure()
    fig.add_trace(
        go.Violin(y=temp_dfs[length_column],
                  x=temp_dfs["timebin"],
                  points=False,
                  spanmode="hard",
                  line_color='black',
                  line_width=1.5,
                  fillcolor=color,
                  opacity=0.8))
    fig.update_layout(xaxis_title='Interval (hours)',
                      yaxis_title='Read length',
                      title=title or time_length.title,
                      title_x=0.5)

    if log_length:
        # Powers of ten up to ten times the longest read, placed on the log10 axis.
        ticks = [
            10**i for i in range(10)
            if not 10**i > 10 * np.amax(dfs["lengths"])
        ]
        fig.update_layout(yaxis=dict(
            tickmode='array', tickvals=np.log10(ticks), ticktext=ticks))

    # Rotate the time interval labels on the x axis, as the other time plots
    # do; this function previously rotated the y axis labels instead.
    fig.update_xaxes(tickangle=45)

    time_length.fig = fig
    time_length.html = time_length.fig.to_html(full_html=False,
                                               include_plotlyjs='cdn')
    time_length.save(settings)
    return time_length
def overlay_histogram_identity(df, path, settings, palette=None):
    """Create an overlay histogram of the "percentIdentity" column.

    Without a palette the default plotly colors, repeated five times, are used.

    :return: the Plot object holding the figure and its html
    """
    colors = plotly.colors.DEFAULT_PLOTLY_COLORS * 5 if palette is None else palette
    hist_pid = Plot(path=path + "NanoComp_OverlayHistogram_Identity.html",
                    title="Histogram of percent reference identity")
    html, fig = plot_overlay_histogram(df,
                                       colors,
                                       "percentIdentity",
                                       hist_pid.title,
                                       density=True)
    hist_pid.html = html
    hist_pid.fig = fig
    hist_pid.save(settings)
    return hist_pid
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Draw a plotly line plot comparing cumulative yield of the datasets.

    Per dataset, the cumulative sum of "lengths" in gigabases (maximum per
    10 minute bin) is plotted against run time in hours.

    :return: list containing the single Plot object
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "Nanoplotter: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path + "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")

    data = []
    for dataset, colour in zip(df["dataset"].unique(), palette):
        of_dataset = dfs["dataset"] == dataset
        gigabases = dfs.loc[of_dataset, "lengths"].cumsum().resample('10T').max() / 1e9
        trace = go.Scatter(x=gigabases.index.total_seconds() / 3600,
                           y=gigabases,
                           opacity=0.75,
                           name=dataset,
                           marker=dict(color=colour))
        data.append(trace)

    def figure_spec():
        # Built anew for each consumer, as the traces and layout are used twice.
        return {
            "data": data,
            "layout": go.Layout(
                barmode='overlay',
                title=title or cum_yield_gb.title,
                xaxis=dict(title="Time (hours)"),
                yaxis=dict(title="Yield (gigabase)"),
            )
        }

    cum_yield_gb.html = plotly.offline.plot(figure_spec(),
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure(figure_spec())
    cum_yield_gb.save()
    return [cum_yield_gb]
def active_pores_over_time(df, path, palette=None, title=None):
    """Draw a plotly line plot of the number of active pores per dataset.

    Per dataset, the number of distinct "channelIDs" in every 10 minute bin
    is plotted against run time in hours.

    :return: the Plot object (not wrapped in a list)
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info("NanoComp: Creating active pores plot using {} reads.".format(
        len(dfs)))
    active_pores = Plot(path=path + "NanoComp_ActivePoresOverTime.html",
                        title="Active pores over time")

    data = []
    for dataset, colour in zip(df["dataset"].unique(), palette):
        of_dataset = dfs["dataset"] == dataset
        distinct_channels = dfs.loc[of_dataset, "channelIDs"].resample('10T').nunique()
        trace = go.Scatter(x=distinct_channels.index.total_seconds() / 3600,
                           y=distinct_channels,
                           opacity=0.75,
                           name=dataset,
                           marker=dict(color=colour))
        data.append(trace)

    def figure_spec():
        # Built anew for each consumer, as the traces and layout are used twice.
        return {
            "data": data,
            "layout": go.Layout(
                barmode='overlay',
                title=title or active_pores.title,
                xaxis=dict(title="Time (hours)"),
                yaxis=dict(title="Active pores (per 10 minutes)"),
            )
        }

    active_pores.html = plotly.offline.plot(figure_spec(),
                                            output_type="div",
                                            show_link=False)
    active_pores.fig = go.Figure(figure_spec())
    active_pores.save()
    return active_pores
def dynamic_histogram(array, name, path, figformat, title=None, color="#4CB391"):
    """Build a plotly histogram of at most 10000 sampled values of array.

    The first character of name is lowercased for the file name and the
    default title; spaces after the first character become underscores in
    the file name. The y axis label states whether the data was downsampled.

    :return: the Plot object holding the figure and its html
    """
    first, rest = name[0].lower(), name[1:]
    file_name = f"Dynamic_Histogram_{first + rest.replace(' ', '_')}.html"
    dynhist = Plot(path=path + file_name,
                   title="Dynamic histogram of {}".format(first + rest))

    if len(array) > 10000:
        ylabel = "Downsampled number of reads"
    else:
        ylabel = "Number of reads"

    subsample = array.sample(min(len(array), 10000))
    dynhist.html, dynhist.fig = plotly_histogram(array=subsample,
                                                 color=color,
                                                 title=title or dynhist.title,
                                                 xlabel=name,
                                                 ylabel=ylabel)
    dynhist.save(figformat)
    return dynhist
def quality_over_time(dfs, path, figformat, title, plot_settings=None):
    """Create a seaborn violin plot of the "quals" column per "timebin".

    :param dfs: dataframe with "timebin" and "quals" columns
    :param path: prefix of the output file
    :param figformat: file extension of the figure, also passed to Plot.save
    :param title: optional title replacing the default one
    :param plot_settings: optional dict of extra keyword arguments for sns.set
    :return: the Plot object
    """
    time_qual = Plot(path=path + "TimeQualityViolinPlot." + figformat,
                     title="Violin plot of quality over time")
    # None instead of a shared mutable {} default; an empty dict adds nothing.
    sns.set(style="white", **(plot_settings or {}))
    ax = sns.violinplot(x="timebin",
                        y="quals",
                        data=dfs,
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Basecall quality",
           title=title or time_qual.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_qual.fig = ax.get_figure()
    time_qual.save(format=figformat)
    plt.close("all")
    return time_qual
def plot_over_time(dfs, path, title, figformat, color="#4CB391"):
    """Draw plotly scatter plots of read counts and active pores over time.

    The number of reads per 10 minute bin is always plotted; when dfs has a
    "channelIDs" column, the number of distinct channels per 10 minute bin is
    plotted as well.

    :return: list of one or two Plot objects
    """

    def fill_and_save(plot, series, yaxis_title):
        figure = px.scatter(data_frame=None,
                            x=series.index.total_seconds() / 3600,
                            y=series)
        figure.update_traces(marker=dict(color=color))
        figure.update_layout(xaxis_title='Run time (hours)',
                             yaxis_title=yaxis_title,
                             title=title or plot.title,
                             title_x=0.5)
        plot.fig = figure
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(figformat)

    num_reads = Plot(path=path + "NumberOfReads_Over_Time.html",
                     title="Number of reads over time")
    reads_per_bin = dfs.loc[:, "lengths"].resample('10T').count()
    fill_and_save(num_reads, reads_per_bin, 'Number of reads per 10 minutes')
    plots = [num_reads]

    if "channelIDs" not in dfs:
        return plots

    pores_over_time = Plot(path=path + "ActivePores_Over_Time.html",
                           title="Number of active pores over time")
    pores_per_bin = dfs.loc[:, "channelIDs"].resample('10T').nunique()
    fill_and_save(pores_over_time, pores_per_bin, 'Active pores per 10 minutes')
    plots.append(pores_over_time)
    return plots
def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings=None):
    """Create a seaborn violin plot of sequencing speed per time bin.

    Speed is the "lengths" column divided by the "duration" column. Rows with
    a duration of zero are left out of the plot, since dividing by them does
    not give a finite speed. A "timebin" column is added to dfs in place when
    it is missing.

    :param dfs: dataframe with "lengths" and "duration" columns
    :param path: prefix of the output file
    :param figformat: file extension of the figure, also passed to Plot.save
    :param title: optional title replacing the default one
    :param plot_settings: optional dict of extra keyword arguments for sns.set
    :return: the Plot object
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                         title="Violin plot of sequencing speed over time")
    # None instead of a shared mutable {} default; an empty dict adds nothing.
    sns.set(style="white", **(plot_settings or {}))
    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)
    mask = dfs['duration'] != 0
    ax = sns.violinplot(x=dfs.loc[mask, "timebin"],
                        y=dfs.loc[mask, "lengths"] / dfs.loc[mask, "duration"],
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or time_duration.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_duration.fig = ax.get_figure()
    time_duration.save(format=figformat)
    plt.close("all")
    return time_duration
def overlay_histogram_phred(df, path, figformat, palette=None):
    """Create an overlay histogram of phred-scaled percent identity.

    Reads with a percentIdentity of 100 get an infinite phred score, which
    cannot be binned; these are set to 60, a very high phred score.

    A "phredIdentity" column is added to df in place.

    :param df: dataframe with a "percentIdentity" column
    :param path: prefix of the output file
    :param figformat: passed on to Plot.save as figformat
    :param palette: optional list of colors, defaults to the plotly colors
                    repeated five times
    :return: the Plot object
    """
    df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # Cap the infinite scores of perfect alignments before building the histogram.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")
    hist_phred.html, hist_phred.fig = plot_overlay_histogram(df,
                                                             palette,
                                                             "phredIdentity",
                                                             hist_phred.title,
                                                             bins=20,
                                                             density=True)
    hist_phred.save(figformat=figformat)
    return hist_phred
def compare_sequencing_speed(df, figformat, path, title=None, palette=None):
    """Create a seaborn violin plot comparing sequencing speed over time.

    Speed is the "lengths" column divided by the "duration" column, drawn per
    time bin with one hue per dataset. Reads without a positive duration are
    left out, since they do not give a meaningful speed.

    :param df: dataframe with "dataset", "start_time", "lengths" and
               "duration" columns
    :param figformat: file extension of the figure, also passed to Plot.save
    :param path: prefix of the output file
    :param title: optional title replacing the default one
    :param palette: optional palette forwarded to seaborn
    :return: list containing the single Plot object
    """
    logging.info(
        "Nanoplotter: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time." + figformat,
                     title="Sequencing speed over time")
    dfs = check_valid_time_and_sort(df, "start_time")
    dfs['timebin'] = add_time_bins(dfs)
    # Filter after binning so the time bins are derived from all reads.
    dfs = dfs.loc[dfs["duration"] > 0]
    ax = sns.violinplot(x=dfs["timebin"],
                        y=dfs["lengths"] / dfs["duration"],
                        hue=dfs["dataset"],
                        inner=None,
                        cut=0,
                        linewidth=0,
                        palette=palette)
    # title and palette were accepted but never applied before.
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or seq_speed.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    seq_speed.fig = ax.get_figure()
    seq_speed.save(format=figformat)
    plt.close("all")
    return [seq_speed]
def cumulative_yield(dfs, path, title, color, figformat):
    """Create two plotly scatter plots of cumulative yield against run time.

    The first plot shows the cumulative sum of "lengths" in gigabases
    (maximum per 10 minute bin), the second the cumulative number of reads
    (counted per 10 minute bin).

    :param dfs: time-indexed dataframe with a "lengths" column
    :param path: prefix of the output files
    :param title: optional title replacing the default ones
    :param color: marker color
    :param figformat: passed on to Plot.save
    :return: list with the gigabase Plot followed by the read count Plot
    """
    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")

    s = dfs.loc[:, "lengths"].cumsum().resample('10T').max() / 1e9

    fig = px.scatter(x=s.index.total_seconds() / 3600, y=s)
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Run time (hours)',
                      yaxis_title='Cumulative yield in gigabase',
                      title=title or cum_yield_gb.title,
                      title_x=0.5)

    cum_yield_gb.fig = fig
    cum_yield_gb.html = cum_yield_gb.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    cum_yield_gb.save(figformat)

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads.html",
                           title="Cumulative yield")

    s = dfs.loc[:, "lengths"].resample('10T').count().cumsum()

    fig = px.scatter(x=s.index.total_seconds() / 3600, y=s)
    fig.update_traces(marker=dict(color=color))
    # Fall back to this plot's own title rather than the gigabase plot's.
    fig.update_layout(xaxis_title='Run time (hours)',
                      yaxis_title='Cumulative yield in number of reads',
                      title=title or cum_yield_reads.title,
                      title_x=0.5)

    cum_yield_reads.fig = fig
    cum_yield_reads.html = cum_yield_reads.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    cum_yield_reads.save(figformat)

    return [cum_yield_gb, cum_yield_reads]
def yield_by_minimal_length_plot(array, name, path, title=None, color="#4CB391", figformat="png"):
    """Draw a seaborn scatter plot of cumulative yield by minimal read length.

    Lengths are sorted from long to short and summed cumulatively, expressed
    in units of 10**9 bases. The name argument is not used.

    :return: the Plot object
    """
    descending = np.sort(array)[::-1]
    df = pd.DataFrame(data={"lengths": descending})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9

    yield_by_length = Plot(path=path + "Yield_By_Length." + figformat,
                           title="Yield by length")
    axes = sns.regplot(x='lengths',
                       y="cumyield_gb",
                       data=df,
                       x_ci=None,
                       fit_reg=False,
                       color=color,
                       scatter_kws={"s": 3})
    axes.set(xlabel='Read length',
             ylabel='Cumulative yield for minimal length',
             title=title or yield_by_length.title)

    yield_by_length.fig = axes.get_figure()
    yield_by_length.save(format=figformat)
    plt.close("all")
    return yield_by_length
def plot_over_time(dfs, path, figformat, title, color):
    """Draw seaborn scatter plots of read counts and active pores over time.

    The number of reads per 10 minute bin is always plotted; when dfs has a
    "channelIDs" column, the number of distinct channels per 10 minute bin is
    plotted as well.

    :return: list of one or two Plot objects
    """

    def draw_and_save(plot, series, ylabel):
        axes = sns.regplot(x=series.index.total_seconds() / 3600,
                           y=series,
                           x_ci=None,
                           fit_reg=False,
                           color=color,
                           scatter_kws={"s": 3})
        axes.set(xlabel='Run time (hours)',
                 ylabel=ylabel,
                 title=title or plot.title)
        plot.fig = axes.get_figure()
        plot.save(format=figformat)
        plt.close("all")

    num_reads = Plot(path=path + "NumberOfReads_Over_Time." + figformat,
                     title="Number of reads over time")
    reads_per_bin = dfs.loc[:, "lengths"].resample('10T').count()
    draw_and_save(num_reads, reads_per_bin, 'Number of reads per 10 minutes')
    plots = [num_reads]

    if "channelIDs" not in dfs:
        return plots

    pores_over_time = Plot(path=path + "ActivePores_Over_Time." + figformat,
                           title="Number of active pores over time")
    pores_per_bin = dfs.loc[:, "channelIDs"].resample('10T').nunique()
    draw_and_save(pores_over_time, pores_per_bin, 'Active pores per 10 minutes')
    plots.append(pores_over_time)
    return plots
def n50_barplot(df, path, settings, title=None):
    '''
    Returns Plot object and creates figure(format specified)/html
    containing bar chart of the aligned/sequenced read length n50

    The N50 is computed by get_N50 on the sorted "aligned_lengths" column
    when the dataframe has one, and on the sorted "lengths" column otherwise.

    :param df: dataframe with a "dataset" column and a lengths column
    :param path: prefix of the output file
    :param settings: passed on to Plot.save
    :param title: optional title replacing the default one
    :return: list containing the single Plot object
    '''
    n50_bar = Plot(path=path + "NanoComp_N50.html",
                   title="Comparing read length N50")
    datasets = df["dataset"].unique()
    if "aligned_lengths" in df:
        column = "aligned_lengths"
        # The bars are N50 values, not a throughput, so say so on the axis.
        ylabel = 'Aligned read length N50'
    else:
        column = "lengths"
        ylabel = 'Sequenced read length N50'
    n50s = [get_N50(np.sort(df.loc[df["dataset"] == d, column]))
            for d in datasets]

    n50_bar.fig = go.Figure()
    for dataset, n50 in zip(datasets, n50s):
        n50_bar.fig.add_trace(go.Bar(x=[dataset], y=[n50], name=dataset))

    n50_bar.fig.update_layout(
        title=title or n50_bar.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    n50_bar.html = n50_bar.fig.to_html(full_html=False, include_plotlyjs='cdn')
    n50_bar.save(settings)
    return [n50_bar]
def compare_sequencing_speed(df, path, settings, title=None):
    """Draw a plotly line plot comparing sequencing speed of the datasets.

    Only reads with a duration above zero are used. Per dataset, the median
    of "lengths" divided by "duration" in every 30 minute bin is plotted
    against run time in hours.

    :return: list containing the single Plot object
    """
    logging.info(
        "NanoComp: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time.html",
                     title="Sequencing speed over time")

    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")
    dfs = dfs.loc[dfs["duration"] > 0]

    palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    data = []
    for dataset, colour in zip(df["dataset"].unique(), palette):
        of_dataset = dfs["dataset"] == dataset
        speed = dfs.loc[of_dataset, "lengths"] / dfs.loc[of_dataset, "duration"]
        median_speed = speed.resample('30T').median()
        trace = go.Scatter(x=median_speed.index.total_seconds() / 3600,
                           y=median_speed,
                           opacity=0.75,
                           name=dataset,
                           mode='lines',
                           marker=dict(color=colour))
        data.append(trace)

    figure = go.Figure({"data": data})
    seq_speed.fig = figure
    figure.update_layout(
        title=title or seq_speed.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title="Sequencing speed (nucleotides/second)")

    seq_speed.html = figure.to_html(full_html=False, include_plotlyjs='cdn')
    seq_speed.save(settings)
    return [seq_speed]
def overlay_histogram(df, path, settings, palette=None):
    """
    Create four plotly overlay histograms of read lengths:
    plain, normalized, log transformed and normalized log transformed.
    Each one is stored as html and figure on its Plot object and saved.
    The default palette has 10 colors, which get recycled up to 5 times.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    # (file name, title, drawing function, extra keyword arguments)
    variants = (
        ("NanoComp_OverlayHistogram.html",
         "Histogram of read lengths",
         plot_overlay_histogram,
         {"column": 'lengths'}),
        ("NanoComp_OverlayHistogram_Normalized.html",
         "Normalized histogram of read lengths",
         plot_overlay_histogram,
         {"column": 'lengths', "density": True}),
        ("NanoComp_OverlayLogHistogram.html",
         "Histogram of log transformed read lengths",
         plot_log_histogram,
         {}),
        ("NanoComp_OverlayLogHistogram_Normalized.html",
         "Normalized histogram of log transformed read lengths",
         plot_log_histogram,
         {"density": True}),
    )

    plots = []
    for file_name, plot_title, draw, extra in variants:
        plot = Plot(path=path + file_name, title=plot_title)
        plot.html, plot.fig = draw(df, palette, title=plot.title, **extra)
        plot.save(settings)
        plots.append(plot)
    return plots
def overlay_histogram(df, path, palette=None):
    """
    Create four plotly overlay histograms of read lengths:
    plain, normalized, log transformed and normalized log transformed.
    Each one is stored as html and figure on its Plot object and saved.
    The default palette has 10 colors, which get recycled up to 5 times.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    # (file name, title, drawing function, extra keyword arguments)
    variants = (
        ("NanoComp_OverlayHistogram.html",
         "Histogram of read lengths",
         plot_overlay_histogram,
         {}),
        ("NanoComp_OverlayHistogram_Normalized.html",
         "Normalized histogram of read lengths",
         plot_overlay_histogram,
         {"histnorm": "probability"}),
        ("NanoComp_OverlayLogHistogram.html",
         "Histogram of log transformed read lengths",
         plot_log_histogram,
         {}),
        ("NanoComp_OverlayLogHistogram_Normalized.html",
         "Normalized histogram of log transformed read lengths",
         plot_log_histogram,
         {"histnorm": "probability"}),
    )

    plots = []
    for file_name, plot_title, draw, extra in variants:
        plot = Plot(path=path + file_name, title=plot_title)
        plot.html, plot.fig = draw(df, palette, title=plot.title, **extra)
        plot.save()
        plots.append(plot)
    return plots