예제 #1
0
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Plot how often each value of ``array`` occurs as a flow-cell heatmap.

    Every distinct value in ``array`` is counted and the count is written to
    the cell(s) of the layout whose ``structure`` entry equals that value
    (presumably values are channel numbers -- confirm against the caller).

    Args:
        array: values to tally; must expose ``.size`` and work with
            ``np.amax``.
        path: output path prefix; ``"." + figformat`` is appended.
        title: optional title; defaults to the Plot's own title.
        color: colormap passed to ``sns.heatmap``.
        figformat: format used for the file extension and when saving.

    Returns:
        A one-element list containing the saved Plot.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))
    activity_map = Plot(path=path + "." + figformat,
                        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()
    for channel, read_count in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = read_count
    plt.figure()
    ax = sns.heatmap(data=pd.DataFrame(layout.template,
                                       index=layout.yticks,
                                       columns=layout.xticks),
                     xticklabels="auto",
                     yticklabels="auto",
                     square=True,
                     cbar_kws={"orientation": "horizontal"},
                     cmap=color,
                     linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
예제 #2
0
def length_over_time(dfs, path, figformat, title, log_length=False, plot_settings={}):
    """Save a seaborn violin plot of read length per time bin.

    Reads the "timebin" column plus "lengths" (or "log_lengths" when
    ``log_length`` is true) from ``dfs``. If a boolean "length_filter" column
    is present, only the rows it selects are plotted.

    Returns:
        The saved Plot object.
    """
    time_length = Plot(path=path + "TimeLengthViolinPlot." + figformat,
                       title="Violin plot of read lengths over time")
    sns.set(style="white", **plot_settings)
    length_column = "log_lengths" if log_length else "lengths"

    # "length_filter" is produced by NanoPlot filtering of too long reads
    plotted = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    ax = sns.violinplot(data=plotted,
                        x="timebin",
                        y=length_column,
                        cut=0,
                        inner=None,
                        linewidth=0)
    ax.set(title=title or time_length.title,
           xlabel='Interval (hours)',
           ylabel="Read length")
    if log_length:
        # Label the log10 axis with the powers of ten up to one decade above
        # the longest read.
        upper = 10 * np.amax(dfs["lengths"])
        ticks = [10**exponent for exponent in range(10) if not 10**exponent > upper]
        ax.set(yticks=np.log10(ticks), yticklabels=ticks)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_length.fig = ax.get_figure()
    time_length.save(format=figformat)
    plt.close("all")
    return time_length
예제 #3
0
def n50_barplot(df, figformat, path, title=None, palette=None):
    """Save a bar plot comparing the read length N50 of every dataset.

    The N50 is computed by ``get_N50`` on the sorted "aligned_lengths" column
    when it exists, otherwise on the "lengths" column.

    Args:
        df: DataFrame with a "dataset" column and a length column.
        figformat: format used for the file extension and when saving.
        path: output path prefix.
        title: optional title; defaults to the Plot's own title.
        palette: palette forwarded to ``sns.barplot``.

    Returns:
        A one-element list containing the saved Plot.
    """
    n50_bar = Plot(path=path + "NanoComp_N50." + figformat,
                   title="Comparing read length N50")
    datasets = df["dataset"].unique()
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput, so label the axis as such.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == dataset, length_column]))
        for dataset in datasets
    ]
    ax = sns.barplot(x=list(datasets),
                     y=n50s,
                     palette=palette,
                     order=datasets)
    ax.set(ylabel=ylabel, title=title or n50_bar.title)
    plt.xticks(rotation=30, ha='center')
    n50_bar.fig = ax.get_figure()
    n50_bar.save(format=figformat)
    plt.close("all")
    return [n50_bar]
예제 #4
0
def cumulative_yield(dfs, path, figformat, title, color):
    """Save cumulative-yield scatter plots in gigabases and in read count.

    Args:
        dfs: DataFrame with a "lengths" column. Its index must support
            ``resample`` and ``total_seconds`` (presumably a TimedeltaIndex
            of time since run start -- confirm against the caller).
        path: output path prefix.
        figformat: format used for the file extension and when saving.
        title: optional title; defaults to each Plot's own title.
        color: marker color for both plots.

    Returns:
        ``[cum_yield_gb, cum_yield_reads]``, both already saved.
    """
    cum_yield_gb = Plot(path=path + "CumulativeYieldPlot_Gigabases." +
                        figformat,
                        title="Cumulative yield")
    # "min" is the supported minute alias; the single-letter "T" is deprecated.
    s = dfs.loc[:, "lengths"].cumsum().resample('1min').max() / 1e9
    ax = sns.regplot(x=s.index.total_seconds() / 3600,
                     y=s,
                     x_ci=None,
                     fit_reg=False,
                     color=color,
                     scatter_kws={"s": 3})
    ax.set(xlabel='Run time (hours)',
           ylabel='Cumulative yield in gigabase',
           title=title or cum_yield_gb.title)
    cum_yield_gb.fig = ax.get_figure()
    cum_yield_gb.save(format=figformat)
    plt.close("all")

    cum_yield_reads = Plot(path=path + "CumulativeYieldPlot_NumberOfReads." +
                           figformat,
                           title="Cumulative yield")
    # Count reads per 10-minute bin, then accumulate the counts.
    s = dfs.loc[:, "lengths"].resample('10min').count().cumsum()
    ax = sns.regplot(x=s.index.total_seconds() / 3600,
                     y=s,
                     x_ci=None,
                     fit_reg=False,
                     color=color,
                     scatter_kws={"s": 3})
    ax.set(xlabel='Run time (hours)',
           ylabel='Cumulative yield in number of reads',
           title=title or cum_yield_reads.title)
    cum_yield_reads.fig = ax.get_figure()
    cum_yield_reads.save(format=figformat)
    plt.close("all")
    return [cum_yield_gb, cum_yield_reads]
예제 #5
0
def overlay_histogram_phred(df, path, settings, palette=None):
    """Save an overlaid histogram of phred-scaled reference identity.

    A "phredIdentity" column is added to ``df`` in place, computed as
    ``-10 * log10(1 - percentIdentity / 100)``. Reads with a perfect
    alignment (percentIdentity of 100) would get an infinite score, so those
    are set to 60, a very high phred score.

    Args:
        df: DataFrame with a "percentIdentity" column; it is modified.
        path: output path prefix.
        settings: forwarded unchanged to ``Plot.save``.
        palette: colors for the histogram; defaults to plotly's default
            color list repeated five times.

    Returns:
        The saved Plot object.
    """
    df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # A single .loc assignment writes into df itself; the chained form
    # df[col][mask] = value may update a temporary copy instead.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")

    hist_phred.html, hist_phred.fig = plot_overlay_histogram(df,
                                                             palette,
                                                             "phredIdentity",
                                                             hist_phred.title,
                                                             bins=20,
                                                             density=True)

    hist_phred.save(settings)

    return hist_phred
예제 #6
0
def output_barplot(df, figformat, path, title=None, palette=None):
    """Save two bar plots: reads per dataset and summed bases per dataset.

    The throughput plot sums "aligned_lengths" when that column exists and
    "lengths" otherwise, expressed in units of 1e9.

    Returns:
        The tuple ``(read_count, throughput_bases)`` of saved Plot objects.
    """
    logging.info(
        "NanoComp: Creating barplots for number of reads and total throughput."
    )

    # First figure: number of rows per dataset.
    read_count = Plot(title="Comparing number of reads",
                      path=path + "NanoComp_number_of_reads." + figformat)
    count_ax = sns.countplot(x="dataset", data=df, palette=palette)
    count_ax.set(ylabel='Number of reads', title=title or read_count.title)
    plt.xticks(rotation=30, ha='center')
    read_count.fig = count_ax.get_figure()
    read_count.save(format=figformat)
    plt.close("all")

    # Second figure: summed lengths per dataset.
    throughput_bases = Plot(title="Comparing throughput in gigabases",
                            path=path + "NanoComp_total_throughput." + figformat)
    has_alignment = "aligned_lengths" in df
    length_column = 'aligned_lengths' if has_alignment else 'lengths'
    ylabel = 'Total gigabase aligned' if has_alignment else 'Total gigabase sequenced'
    throughput = df.groupby('dataset')[length_column].sum()
    bar_ax = sns.barplot(x=list(throughput.index),
                         y=throughput / 1e9,
                         palette=palette,
                         order=df["dataset"].unique())
    bar_ax.set(ylabel=ylabel, title=title or throughput_bases.title)
    plt.xticks(rotation=30, ha='center')
    throughput_bases.fig = bar_ax.get_figure()
    throughput_bases.save(format=figformat)
    plt.close("all")

    return read_count, throughput_bases
예제 #7
0
def dynamic_histogram(array, name, path, title=None, color="#4CB391"):
    """Build a plotly histogram of ``array`` and save it through ``Plot.save``.

    At most 10000 values are drawn from ``array`` with its ``sample`` method
    before plotting. The output file name embeds ``name`` with spaces turned
    into underscores.

    Returns:
        The Plot object, carrying both the html snippet and the figure.
    """
    file_name = "Dynamic_Histogram_{}.html".format(name.replace(' ', '_'))
    default_title = "Dynamic histogram of {}".format(name)
    dynhist = Plot(path=path + file_name, title=title or default_title)

    sample_size = min(len(array), 10000)
    html, fig = plotly_histogram(array=array.sample(sample_size),
                                 color=color,
                                 title=dynhist.title)
    dynhist.html = html
    dynhist.fig = fig
    dynhist.save()
    return dynhist
예제 #8
0
def overlay_histogram_identity(df, path, settings, palette=None):
    """Save an overlaid, density-normalised histogram of "percentIdentity".

    When no palette is given, plotly's default color list repeated five times
    is used. ``settings`` is forwarded unchanged to ``Plot.save``.

    Returns:
        The saved Plot object.
    """
    colors = plotly.colors.DEFAULT_PLOTLY_COLORS * 5 if palette is None else palette
    hist_pid = Plot(title="Histogram of percent reference identity",
                    path=path + "NanoComp_OverlayHistogram_Identity.html")
    html, fig = plot_overlay_histogram(df,
                                       colors,
                                       "percentIdentity",
                                       hist_pid.title,
                                       density=True)
    hist_pid.html = html
    hist_pid.fig = fig
    hist_pid.save(settings)
    return hist_pid
예제 #9
0
def dynamic_histogram(array, name, path, figformat, title=None, color="#4CB391"):
    """Build a plotly histogram of ``array`` and save it through ``Plot.save``.

    ``name`` is used with its first character lower-cased, both in the file
    name (spaces after the first character become underscores) and in the
    default title. At most 10000 values are sampled from ``array``; the y
    label says so when downsampling happened.

    Returns:
        The Plot object, carrying both the html snippet and the figure.
    """
    first, rest = name[0].lower(), name[1:]
    dynhist = Plot(
        path=path + f"Dynamic_Histogram_{first + rest.replace(' ', '_')}.html",
        title="Dynamic histogram of {}".format(first + rest))

    total = len(array)
    if total <= 10000:
        ylabel = "Number of reads"
    else:
        ylabel = "Downsampled number of reads"

    html, fig = plotly_histogram(array=array.sample(min(total, 10000)),
                                 color=color,
                                 title=title or dynhist.title,
                                 xlabel=name,
                                 ylabel=ylabel)
    dynhist.html = html
    dynhist.fig = fig
    dynhist.save(figformat)
    return dynhist
예제 #10
0
def quality_over_time(dfs, path, figformat, title, plot_settings={}):
    """Save a seaborn violin plot of the "quals" column per "timebin".

    ``plot_settings`` is expanded into ``sns.set`` alongside the "white"
    style.

    Returns:
        The saved Plot object.
    """
    time_qual = Plot(title="Violin plot of quality over time",
                     path=path + "TimeQualityViolinPlot." + figformat)
    sns.set(style="white", **plot_settings)
    violin_options = dict(inner=None, cut=0, linewidth=0)
    ax = sns.violinplot(data=dfs, x="timebin", y="quals", **violin_options)
    ax.set(title=title or time_qual.title,
           xlabel='Interval (hours)',
           ylabel="Basecall quality")
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_qual.fig = ax.get_figure()
    time_qual.save(format=figformat)
    plt.close("all")
    return time_qual
예제 #11
0
def overlay_histogram_phred(df, path, figformat, palette=None):
    """Save an overlaid histogram of phred-scaled reference identity.

    A "phredIdentity" column is added to ``df`` in place, computed as
    ``-10 * log10(1 - percentIdentity / 100)``. A percentIdentity of 100
    yields an infinite score, which cannot be binned, so those values are
    capped at 60.

    Args:
        df: DataFrame with a "percentIdentity" column; it is modified.
        path: output path prefix.
        figformat: forwarded to ``Plot.save`` as ``figformat``.
        palette: colors for the histogram; defaults to plotly's default
            color list repeated five times.

    Returns:
        The saved Plot object.
    """
    df["phredIdentity"] = -10 * np.log10(1 - (df["percentIdentity"] / 100))
    # Perfect alignments give log10(0) = -inf, i.e. a phred score of +inf.
    df.loc[np.isinf(df["phredIdentity"]), "phredIdentity"] = 60

    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    hist_phred = Plot(path=path + "NanoComp_OverlayHistogram_PhredScore.html",
                      title="Histogram of Phred scores")

    hist_phred.html, hist_phred.fig = plot_overlay_histogram(df,
                                                             palette,
                                                             "phredIdentity",
                                                             hist_phred.title,
                                                             bins=20,
                                                             density=True)

    hist_phred.save(figformat=figformat)

    return hist_phred
예제 #12
0
def sequencing_speed_over_time(dfs, path, figformat, title, plot_settings={}):
    """Save a seaborn violin plot of sequencing speed per time bin.

    Speed is "lengths" divided by "duration". Rows whose duration is zero are
    left out because the division would produce infinite speeds. A "timebin"
    column is added to ``dfs`` in place when it is missing.

    Args:
        dfs: DataFrame with "lengths" and "duration" columns.
        path: output path prefix.
        figformat: format used for the file extension and when saving.
        title: optional title; defaults to the Plot's own title.
        plot_settings: extra keyword arguments for ``sns.set``; only read,
            never modified.

    Returns:
        The saved Plot object.
    """
    time_duration = Plot(path=path + "TimeSequencingSpeed_ViolinPlot." + figformat,
                         title="Violin plot of sequencing speed over time")
    sns.set(style="white", **plot_settings)
    if "timebin" not in dfs:
        dfs['timebin'] = add_time_bins(dfs)
    has_duration = dfs["duration"] != 0
    speed = dfs.loc[has_duration, "lengths"] / dfs.loc[has_duration, "duration"]
    ax = sns.violinplot(x=dfs.loc[has_duration, "timebin"],
                        y=speed,
                        inner=None,
                        cut=0,
                        linewidth=0)
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or time_duration.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    time_duration.fig = ax.get_figure()
    time_duration.save(format=figformat)
    plt.close("all")
    return time_duration
예제 #13
0
def length_over_time(dfs,
                     path,
                     title,
                     settings,
                     log_length=False,
                     color="#4CB391"):
    """Save a plotly violin plot of read length per time bin.

    Plots "log_lengths" when ``log_length`` is true and "lengths" otherwise,
    against the "timebin" column. If a boolean "length_filter" column is
    present, only the rows it selects are plotted. ``settings`` is forwarded
    unchanged to ``Plot.save``.

    Returns:
        The saved Plot object.
    """
    if log_length:
        file_name = "TimeLogLengthViolinPlot.html"
        default_title = "Violin plot of log read lengths over time"
        length_column = "log_lengths"
    else:
        file_name = "TimeLengthViolinPlot.html"
        default_title = "Violin plot of read lengths over time"
        length_column = "lengths"
    time_length = Plot(path=path + file_name, title=default_title)

    # "length_filter" is produced by NanoPlot filtering of too long reads
    plotted = dfs[dfs["length_filter"]] if "length_filter" in dfs else dfs

    violin = go.Violin(x=plotted["timebin"],
                       y=plotted[length_column],
                       points=False,
                       spanmode="hard",
                       line_color='black',
                       line_width=1.5,
                       fillcolor=color,
                       opacity=0.8)
    fig = go.Figure()
    fig.add_trace(violin)
    fig.update_layout(title=title or time_length.title,
                      title_x=0.5,
                      xaxis_title='Interval (hours)',
                      yaxis_title='Read length')

    if log_length:
        # Label the log10 axis with the powers of ten up to one decade above
        # the longest read.
        upper = 10 * np.amax(dfs["lengths"])
        ticks = [10**exponent for exponent in range(10) if not 10**exponent > upper]
        y_axis = dict(tickmode='array', tickvals=np.log10(ticks), ticktext=ticks)
        fig.update_layout(yaxis=y_axis)

    # NOTE(review): this tilts the y-axis tick labels; the other over-time
    # plots tilt the x axis instead -- confirm which one is intended.
    fig.update_yaxes(tickangle=45)

    time_length.fig = fig
    time_length.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
    time_length.save(settings)

    return time_length
예제 #14
0
def compare_sequencing_speed(df, figformat, path, title=None, palette=None):
    """Save a violin plot comparing sequencing speed over time per dataset.

    Speed is "lengths" divided by "duration", grouped by time bin and split
    by the "dataset" column. Rows with a non-positive duration are left out
    because the division would not give a usable speed.

    Args:
        df: DataFrame with "start_time", "lengths", "duration" and "dataset"
            columns.
        figformat: format used for the file extension and when saving.
        path: output path prefix.
        title: optional title; defaults to the Plot's own title.
        palette: palette forwarded to ``sns.violinplot``.

    Returns:
        A one-element list containing the saved Plot.
    """
    logging.info(
        "Nanoplotter: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time." +
                     figformat,
                     title="Sequencing speed over time")
    dfs = check_valid_time_and_sort(df, "start_time")
    dfs['timebin'] = add_time_bins(dfs)
    timed = dfs.loc[dfs["duration"] > 0]
    ax = sns.violinplot(x=timed["timebin"],
                        y=timed["lengths"] / timed["duration"],
                        hue=timed["dataset"],
                        palette=palette,
                        inner=None,
                        cut=0,
                        linewidth=0)
    # title and palette were accepted but never applied before.
    ax.set(xlabel='Interval (hours)',
           ylabel="Sequencing speed (nucleotides/second)",
           title=title or seq_speed.title)
    plt.xticks(rotation=45, ha='center', fontsize=8)
    seq_speed.fig = ax.get_figure()
    seq_speed.save(format=figformat)
    plt.close("all")
    return [seq_speed]
예제 #15
0
def sequencing_speed_over_time(dfs, path, title, settings, color="#4CB391"):
    """Save a plotly violin plot of sequencing speed per time bin.

    Speed is "lengths" divided by "duration"; rows with a duration of zero
    are excluded before dividing. ``settings`` is forwarded unchanged to
    ``Plot.save``.

    Returns:
        The saved Plot object.
    """
    time_duration = Plot(title="Violin plot of sequencing speed over time",
                         path=path + "TimeSequencingSpeed_ViolinPlot.html")

    nonzero = dfs['duration'] != 0
    speed = dfs.loc[nonzero, "lengths"] / dfs.loc[nonzero, "duration"]

    violin = go.Violin(x=dfs.loc[nonzero, "timebin"],
                       y=speed,
                       points=False,
                       spanmode="hard",
                       line_color='black',
                       line_width=1.5,
                       fillcolor=color,
                       opacity=0.8)
    fig = go.Figure()
    fig.add_trace(violin)
    fig.update_layout(title=title or time_duration.title,
                      title_x=0.5,
                      xaxis_title='Interval (hours)',
                      yaxis_title='Sequencing speed (nucleotides/second)')
    fig.update_xaxes(tickangle=45)

    time_duration.fig = fig
    time_duration.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
    time_duration.save(settings)

    return time_duration
예제 #16
0
def quality_over_time(dfs, path, settings, title=None, color="#4CB391"):
    """Save a plotly violin plot of the "quals" column per "timebin".

    ``settings`` is forwarded unchanged to ``Plot.save``.

    Returns:
        The saved Plot object.
    """
    time_qual = Plot(title="Violin plot of quality over time",
                     path=path + "TimeQualityViolinPlot.html")

    violin = go.Violin(x=dfs["timebin"],
                       y=dfs["quals"],
                       points=False,
                       spanmode="hard",
                       line_color='black',
                       line_width=1.5,
                       fillcolor=color,
                       opacity=0.8)
    fig = go.Figure()
    fig.add_trace(violin)
    fig.update_layout(title=title or time_qual.title,
                      title_x=0.5,
                      xaxis_title='Interval (hours)',
                      yaxis_title='Basecall quality')
    fig.update_xaxes(tickangle=45)

    time_qual.fig = fig
    time_qual.html = fig.to_html(full_html=False, include_plotlyjs='cdn')
    time_qual.save(settings)

    return time_qual
예제 #17
0
def spatial_heatmap(array, path, colormap, figformat, title=None):
    """Plot how often each value of ``array`` occurs as a plotly heatmap.

    Every distinct value in ``array`` is counted and the count is written to
    the cell(s) of the layout whose ``structure`` entry equals that value
    (presumably values are channel numbers -- confirm against the caller).

    Args:
        array: values to tally; must expose ``.size`` and work with
            ``np.amax``.
        path: output path prefix; ".html" is appended.
        colormap: colorscale passed to ``go.Heatmap``.
        figformat: forwarded unchanged to ``Plot.save``.
        title: optional title; defaults to the Plot's own title.

    Returns:
        A one-element list containing the saved Plot.
    """
    logging.info(
        "Nanoplotter: Creating heatmap of reads per channel using {} reads.".
        format(array.size))

    activity_map = Plot(path=path + ".html",
                        title="Number of reads generated per channel")

    layout = make_layout(maxval=np.amax(array))
    # Series.value_counts() replaces the deprecated top-level pd.value_counts.
    reads_per_channel = pd.Series(array).value_counts()

    for channel, read_count in reads_per_channel.items():
        layout.template[np.where(layout.structure == channel)] = read_count

    data = pd.DataFrame(layout.template,
                        index=layout.yticks,
                        columns=layout.xticks)

    fig = go.Figure(
        data=go.Heatmap(z=data.values.tolist(), colorscale=colormap))
    fig.update_layout(xaxis_title='Channel',
                      yaxis_title='Number of reads',
                      title=title or activity_map.title,
                      title_x=0.5)

    activity_map.fig = fig
    activity_map.html = activity_map.fig.to_html(full_html=False,
                                                 include_plotlyjs='cdn')
    activity_map.save(figformat)
    return [activity_map]
예제 #18
0
def yield_by_minimal_length_plot(array,
                                 name,
                                 path,
                                 settings,
                                 title=None,
                                 color="#4CB391"):
    """Save a scatter plot of cumulative yield against minimal read length.

    Lengths are sorted in descending order and accumulated, so each point
    gives the yield (in units of 1e9) contributed by reads at least that
    long. At most 10000 randomly chosen points are drawn.

    Args:
        array: read lengths.
        name: unused; kept for interface compatibility.
        path: output path prefix.
        settings: forwarded unchanged to ``Plot.save``.
        title: optional title; defaults to the Plot's own title.
        color: marker color.

    Returns:
        The saved Plot object.
    """
    df = pd.DataFrame(data={"lengths": np.sort(array)[::-1]})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9
    # Draw the sample from df's own index. np.sort returns a plain array, so
    # df has a fresh 0..n-1 index; labels taken from the input's index would
    # not match it whenever the input carries a non-default index.
    idx = np.random.choice(df.index, min(10000, len(df)), replace=False)
    sampled = df.loc[idx]

    yield_by_length = Plot(path=path + "Yield_By_Length.html",
                           title="Yield by length")

    fig = px.scatter(df,
                     x=sampled["lengths"],
                     y=sampled["cumyield_gb"])
    fig.update_traces(marker=dict(color=color))
    fig.update_layout(xaxis_title='Read length',
                      yaxis_title='Cumulative yield for minimal length [Gb]',
                      title=title or yield_by_length.title,
                      title_x=0.5)

    yield_by_length.fig = fig
    yield_by_length.html = yield_by_length.fig.to_html(full_html=False,
                                                       include_plotlyjs='cdn')
    yield_by_length.save(settings)

    return yield_by_length
예제 #19
0
def yield_by_minimal_length_plot(array, name, path,
                                 title=None, color="#4CB391", figformat="png"):
    """Save a seaborn scatter plot of cumulative yield against read length.

    Lengths are sorted in descending order and accumulated, so each point
    gives the yield (in units of 1e9) contributed by reads at least that
    long. ``name`` is not used.

    Returns:
        The saved Plot object.
    """
    descending = np.sort(array)[::-1]
    df = pd.DataFrame(data={"lengths": descending})
    df["cumyield_gb"] = df["lengths"].cumsum() / 10**9

    yield_by_length = Plot(title="Yield by length",
                           path=path + "Yield_By_Length." + figformat)
    ax = sns.regplot(data=df,
                     x='lengths',
                     y="cumyield_gb",
                     fit_reg=False,
                     x_ci=None,
                     color=color,
                     scatter_kws={"s": 3})
    ax.set(title=title or yield_by_length.title,
           xlabel='Read length',
           ylabel='Cumulative yield for minimal length')
    yield_by_length.fig = ax.get_figure()
    yield_by_length.save(format=figformat)
    plt.close("all")
    return yield_by_length
예제 #20
0
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Save an interactive plot of cumulative yield over time per dataset.

    For every dataset the "lengths" column is accumulated, reduced to the
    maximum per 10-minute bin and divided by 1e9. Each line is annotated at
    the right edge with its final yield.

    Args:
        df: DataFrame with "start_time", "dataset" and "lengths" columns.
        path: output path prefix.
        palette: one color per dataset; defaults to plotly's default color
            list repeated five times.
        title: optional title; defaults to the Plot's own title.

    Returns:
        A one-element list containing the saved Plot.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "NanoComp: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path +
                        "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    annotations = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # "min" is the supported minute alias; "T" is deprecated.
        cumsum = dfs.loc[dfs["dataset"] == sample,
                         "lengths"].cumsum().resample('10min').max() / 1e9
        # .iloc selects by position; plain [-1] on a label-indexed Series is
        # deprecated.
        final_yield = cumsum.iloc[-1]
        data.append(
            go.Scatter(x=cumsum.index.total_seconds() / 3600,
                       y=cumsum,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))
        annotations.append(
            dict(xref='paper',
                 x=0.99,
                 y=final_yield,
                 xanchor='left',
                 yanchor='middle',
                 text='{}Gb'.format(round(final_yield)),
                 showarrow=False))

    def figure_spec():
        # A fresh layout per consumer, exactly as when it was written twice.
        return {
            "data": data,
            "layout": go.Layout(barmode='overlay',
                                title=title or cum_yield_gb.title,
                                xaxis=dict(title="Time (hours)"),
                                yaxis=dict(title="Yield (gigabase)"),
                                annotations=annotations)
        }

    cum_yield_gb.html = plotly.offline.plot(figure_spec(),
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure(figure_spec())
    cum_yield_gb.save()
    return [cum_yield_gb]
예제 #21
0
def plot_over_time(dfs, path, figformat, title, color):
    """Save scatter plots of read count and distinct channels per 10 minutes.

    The channel plot is only produced when ``dfs`` has a "channelIDs" column.

    Args:
        dfs: DataFrame with a "lengths" column. Its index must support
            ``resample`` and ``total_seconds`` (presumably a TimedeltaIndex
            of time since run start -- confirm against the caller).
        path: output path prefix.
        figformat: format used for the file extension and when saving.
        title: optional title; defaults to each Plot's own title.
        color: marker color.

    Returns:
        A list with the read-count Plot and, when available, the
        active-pores Plot.
    """
    num_reads = Plot(path=path + "NumberOfReads_Over_Time." + figformat,
                     title="Number of reads over time")
    # "min" is the supported minute alias; the single-letter "T" is deprecated.
    s = dfs.loc[:, "lengths"].resample('10min').count()
    ax = sns.regplot(x=s.index.total_seconds() / 3600,
                     y=s,
                     x_ci=None,
                     fit_reg=False,
                     color=color,
                     scatter_kws={"s": 3})
    ax.set(xlabel='Run time (hours)',
           ylabel='Number of reads per 10 minutes',
           title=title or num_reads.title)
    num_reads.fig = ax.get_figure()
    num_reads.save(format=figformat)
    plt.close("all")
    plots = [num_reads]

    if "channelIDs" in dfs:
        pores_over_time = Plot(path=path + "ActivePores_Over_Time." +
                               figformat,
                               title="Number of active pores over time")
        # Distinct channel values seen in each 10-minute bin.
        s = dfs.loc[:, "channelIDs"].resample('10min').nunique()
        ax = sns.regplot(x=s.index.total_seconds() / 3600,
                         y=s,
                         x_ci=None,
                         fit_reg=False,
                         color=color,
                         scatter_kws={"s": 3})
        ax.set(xlabel='Run time (hours)',
               ylabel='Active pores per 10 minutes',
               title=title or pores_over_time.title)
        pores_over_time.fig = ax.get_figure()
        pores_over_time.save(format=figformat)
        plt.close("all")
        plots.append(pores_over_time)
    return plots
예제 #22
0
def compare_cumulative_yields(df, path, palette=None, title=None):
    """Save an interactive plot of cumulative yield over time per dataset.

    For every dataset the "lengths" column is accumulated, reduced to the
    maximum per 10-minute bin and divided by 1e9.

    Args:
        df: DataFrame with "start_time", "dataset" and "lengths" columns.
        path: output path prefix.
        palette: one color per dataset; defaults to plotly's default color
            list repeated five times.
        title: optional title; defaults to the Plot's own title.

    Returns:
        A one-element list containing the saved Plot.
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info(
        "Nanoplotter: Creating cumulative yield plots using {} reads.".format(
            len(dfs)))
    cum_yield_gb = Plot(path=path +
                        "NanoComp_CumulativeYieldPlot_Gigabases.html",
                        title="Cumulative yield")
    data = []
    for dataset, color in zip(df["dataset"].unique(), palette):
        # "min" is the supported minute alias; "T" is deprecated.
        yield_gb = dfs.loc[dfs["dataset"] == dataset,
                           "lengths"].cumsum().resample('10min').max() / 1e9
        data.append(
            go.Scatter(x=yield_gb.index.total_seconds() / 3600,
                       y=yield_gb,
                       opacity=0.75,
                       name=dataset,
                       marker=dict(color=color)))

    def figure_spec():
        # A fresh layout per consumer, exactly as when it was written twice.
        return {
            "data": data,
            "layout": go.Layout(
                barmode='overlay',
                title=title or cum_yield_gb.title,
                xaxis=dict(title="Time (hours)"),
                yaxis=dict(title="Yield (gigabase)"),
            )
        }

    cum_yield_gb.html = plotly.offline.plot(figure_spec(),
                                            output_type="div",
                                            show_link=False)
    cum_yield_gb.fig = go.Figure(figure_spec())
    cum_yield_gb.save()
    return [cum_yield_gb]
예제 #23
0
def active_pores_over_time(df, path, palette=None, title=None):
    """Save an interactive plot of distinct channels per 10 minutes by dataset.

    Args:
        df: DataFrame with "start_time", "dataset" and "channelIDs" columns.
        path: output path prefix.
        palette: one color per dataset; defaults to plotly's default color
            list repeated five times.
        title: optional title; defaults to the Plot's own title.

    Returns:
        The saved Plot object (not wrapped in a list).
    """
    if palette is None:
        palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5
    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")

    logging.info("NanoComp: Creating active pores plot using {} reads.".format(
        len(dfs)))
    active_pores = Plot(path=path + "NanoComp_ActivePoresOverTime.html",
                        title="Active pores over time")
    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        # "min" is the supported minute alias; "T" is deprecated.
        pores = dfs.loc[dfs["dataset"] == sample,
                        "channelIDs"].resample('10min').nunique()
        data.append(
            go.Scatter(x=pores.index.total_seconds() / 3600,
                       y=pores,
                       opacity=0.75,
                       name=sample,
                       marker=dict(color=color)))

    def figure_spec():
        # A fresh layout per consumer, exactly as when it was written twice.
        return {
            "data": data,
            "layout": go.Layout(
                barmode='overlay',
                title=title or active_pores.title,
                xaxis=dict(title="Time (hours)"),
                yaxis=dict(title="Active pores (per 10 minutes)"),
            )
        }

    active_pores.html = plotly.offline.plot(figure_spec(),
                                            output_type="div",
                                            show_link=False)
    active_pores.fig = go.Figure(figure_spec())
    active_pores.save()
    return active_pores
예제 #24
0
def n50_barplot(df, path, settings, title=None):
    """Save a plotly bar chart comparing the read length N50 of every dataset.

    The N50 is computed by ``get_N50`` on the sorted "aligned_lengths" column
    when it exists, otherwise on the "lengths" column.

    Args:
        df: DataFrame with a "dataset" column and a length column.
        path: output path prefix.
        settings: forwarded unchanged to ``Plot.save``.
        title: optional title; defaults to the Plot's own title.

    Returns:
        A one-element list containing the saved Plot.
    """
    n50_bar = Plot(path=path + "NanoComp_N50.html",
                   title="Comparing read length N50")
    datasets = df["dataset"].unique()
    if "aligned_lengths" in df:
        length_column = "aligned_lengths"
        # The bars show an N50, not a throughput, so label the axis as such.
        ylabel = 'Aligned read length N50'
    else:
        length_column = "lengths"
        ylabel = 'Sequenced read length N50'
    n50s = [
        get_N50(np.sort(df.loc[df["dataset"] == dataset, length_column]))
        for dataset in datasets
    ]

    n50_bar.fig = go.Figure()

    # One single-bar trace per dataset so each gets its own legend entry.
    for dataset, n50 in zip(datasets, n50s):
        n50_bar.fig.add_trace(go.Bar(x=[dataset], y=[n50], name=dataset))

    n50_bar.fig.update_layout(
        title=title or n50_bar.title,
        title_x=0.5,
        yaxis_title=ylabel,
    )

    n50_bar.html = n50_bar.fig.to_html(full_html=False, include_plotlyjs='cdn')
    n50_bar.save(settings)
    return [n50_bar]
예제 #25
0
def compare_sequencing_speed(df, path, settings, title=None):
    """Save a line plot of median sequencing speed over time per dataset.

    Speed is "lengths" divided by "duration"; rows with a non-positive
    duration are dropped first. For every dataset the median speed per
    30-minute bin is drawn as one line.

    Args:
        df: DataFrame with "start_time", "dataset", "lengths" and "duration"
            columns.
        path: output path prefix.
        settings: forwarded unchanged to ``Plot.save``.
        title: optional title; defaults to the Plot's own title.

    Returns:
        A one-element list containing the saved Plot.
    """
    logging.info(
        "NanoComp: creating comparison of sequencing speed over time.")
    seq_speed = Plot(path=path + "NanoComp_sequencing_speed_over_time.html",
                     title="Sequencing speed over time")

    dfs = check_valid_time_and_sort(df, "start_time").set_index("start_time")
    dfs = dfs.loc[dfs["duration"] > 0]

    palette = plotly.colors.DEFAULT_PLOTLY_COLORS * 5

    data = []
    for sample, color in zip(df["dataset"].unique(), palette):
        reads = dfs.loc[dfs["dataset"] == sample]
        # "min" is the supported minute alias; "T" is deprecated.
        seqspeed = (reads["lengths"] /
                    reads["duration"]).resample('30min').median()
        data.append(
            go.Scatter(x=seqspeed.index.total_seconds() / 3600,
                       y=seqspeed,
                       opacity=0.75,
                       name=sample,
                       mode='lines',
                       marker=dict(color=color)))

    seq_speed.fig = go.Figure({"data": data})

    seq_speed.fig.update_layout(
        title=title or seq_speed.title,
        title_x=0.5,
        xaxis_title='Interval (hours)',
        yaxis_title="Sequencing speed (nucleotides/second)")

    seq_speed.html = seq_speed.fig.to_html(full_html=False,
                                           include_plotlyjs='cdn')
    seq_speed.save(settings)
    return [seq_speed]
예제 #26
0
def _finish_joint_plot(target, grid, names, log, maxvalx, title, figformat):
    """Label, title and save a seaborn joint plot and store it on ``target``.

    When ``log`` is set the x axis is relabelled with plain powers of ten
    (the x values are then expected to be log10 transformed) and the title
    of ``target`` is extended accordingly.
    """
    grid.set_axis_labels(names[0], names[1])
    if log:
        target.title = target.title + " after log transformation of read lengths"
        # Powers of ten up to ten times the largest (back-transformed) x value.
        ticks = [10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)]
        grid.ax_joint.set_xticks(np.log10(ticks))
        grid.ax_marg_x.set_xticks(np.log10(ticks))
        grid.ax_joint.set_xticklabels(ticks)
    # Leave room above the axes for the figure title.
    plt.subplots_adjust(top=0.90)
    grid.fig.suptitle(title or "{} vs {} plot".format(names[0], names[1]), fontsize=25)
    target.fig = grid
    target.save(format=figformat)


def scatter(x, y, names, path, plots, color="#4CB391", figformat="png",
            stat=None, log=False, minvalx=0, minvaly=0, title=None,
            plot_settings=None, xmax=None, ymax=None):
    """Create bivariate plots.

    Create four types of bivariate plots of x vs y, containing marginal summaries
    -A scatter plot with histograms on axes
    -A hexagonal binned plot with histograms on axes
    -A kernel density plot with density curves on axes
    -A pauvre-style plot using code from https://github.com/conchoecia/pauvre

    Args:
        x, y: Values to plot against each other. ``x`` must offer ``.size``
            and ``.index`` (presumably pandas Series - confirm with callers).
        names: Two axis names, ``[name_of_x, name_of_y]``.
        path: Prefix for the output files; a plot type suffix and
            ``figformat`` are appended.
        plots: Mapping with the keys "hex", "dot", "kde" and "pauvre"; a
            truthy value requests that plot type.
        color: Colour handed to seaborn.
        figformat: File format/extension of the saved figures.
        stat: Passed to seaborn as ``stat_func``.
        log: Whether x holds log transformed read lengths.
        minvalx, minvaly: Lower axis limits.
        title: Optional figure title overriding the default one.
        plot_settings: Extra keyword arguments for ``sns.set``; ``None``
            means no extra settings.
        xmax, ymax: Optional upper axis limits; the data maximum is used
            when these are empty.

    Returns:
        The list of ``Plot`` objects that were made; empty when
        ``contains_variance`` rejects the data.
    """
    logging.info("Nanoplotter: Creating {} vs {} plots using statistics from {} reads.".format(
        names[0], names[1], x.size))
    if not contains_variance([x, y], names):
        return []
    # A None default avoids sharing one mutable dict between calls.
    plot_settings = {} if plot_settings is None else plot_settings
    sns.set(style="ticks", **plot_settings)
    maxvalx = xmax or np.amax(x)
    maxvaly = ymax or np.amax(y)

    plots_made = []

    if plots["hex"]:
        hex_plot = Plot(
            path=path + "_hex." + figformat,
            title="{} vs {} plot using hexagonal bins".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x,
            y=y,
            kind="hex",
            color=color,
            stat_func=stat,
            space=0,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            height=10)
        _finish_joint_plot(hex_plot, plot, names, log, maxvalx, title, figformat)
        plots_made.append(hex_plot)

    sns.set(style="darkgrid", **plot_settings)
    if plots["dot"]:
        dot_plot = Plot(
            path=path + "_dot." + figformat,
            title="{} vs {} plot using dots".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x,
            y=y,
            kind="scatter",
            color=color,
            stat_func=stat,
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            height=10,
            joint_kws={"s": 1})
        _finish_joint_plot(dot_plot, plot, names, log, maxvalx, title, figformat)
        plots_made.append(dot_plot)

    if plots["kde"]:
        # The density estimate is drawn from a random subset of at most 2000 reads.
        idx = np.random.choice(x.index, min(2000, len(x)), replace=False)
        kde_plot = Plot(
            path=path + "_kde." + figformat,
            title="{} vs {} plot using a kernel density estimation".format(names[0], names[1]))
        plot = sns.jointplot(
            x=x[idx],
            y=y[idx],
            kind="kde",
            # np.inf: the np.Inf alias no longer exists in NumPy 2.0.
            clip=((0, np.inf), (0, np.inf)),
            xlim=(minvalx, maxvalx),
            ylim=(minvaly, maxvaly),
            space=0,
            color=color,
            stat_func=stat,
            shade_lowest=False,
            height=10)
        _finish_joint_plot(kde_plot, plot, names, log, maxvalx, title, figformat)
        plots_made.append(kde_plot)

    # The pauvre plot is only made for untransformed length vs quality data.
    if plots["pauvre"] and names == ['Read lengths', 'Average read quality'] and log is False:
        pauvre_plot = Plot(
            path=path + "_pauvre." + figformat,
            title="{} vs {} plot using pauvre-style @conchoecia".format(names[0], names[1]))
        sns.set(style="white", **plot_settings)
        margin_plot(df=pd.DataFrame({"length": x, "meanQual": y}),
                    Y_AXES=False,
                    title=title or "Length vs Quality in Pauvre-style",
                    plot_maxlen=None,
                    plot_minlen=0,
                    plot_maxqual=None,
                    plot_minqual=0,
                    lengthbin=None,
                    qualbin=None,
                    BASENAME="whatever",
                    path=pauvre_plot.path,
                    fileform=[figformat],
                    dpi=600,
                    TRANSPARENT=True,
                    QUIET=True)
        plots_made.append(pauvre_plot)
    plt.close("all")
    return plots_made
예제 #27
0
def scatter(x, y, legacy, names, path, plots, color, colormap, settings,
            stat=None, log=False, minvalx=0, minvaly=0, title=None,
            xmax=None, ymax=None):
    """Create interactive bivariate plots of x versus y with plotly.

    Plotly based successor of the seaborn implementation (scatter_legacy):
    - scatter plot with a histogram on both axes (``plots["dot"]``)
    - kernel density plot with a histogram on both axes (``plots["kde"]``)
    - hexbin is not implemented yet
    - the pauvre plot is temporarily not available

    All plots use the same random sample of at most 10000 reads. When any
    value in ``legacy`` equals 1 the seaborn plots are made as well, from
    that same sample. Returns the list of plots made, which is empty when
    ``contains_variance`` rejects the data.
    """
    logging.info(
        f"NanoPlot: Creating {names[0]} vs {names[1]} plots using {x.size} reads."
    )
    if not contains_variance([x, y], names):
        return []

    plots_made = []
    idx = np.random.choice(x.index, min(10000, len(x)), replace=False)
    maxvalx = xmax or np.amax(x[idx])
    maxvaly = ymax or np.amax(y[idx])

    def log_length_axis():
        """Return x axis settings labelling a log10 axis with powers of ten."""
        powers = [
            10**i for i in range(10) if not 10**i > 10 * (10**maxvalx)
        ]
        return dict(tickmode='array',
                    tickvals=np.log10(powers),
                    ticktext=powers,
                    tickangle=45)

    def publish(plot, figure):
        """Attach the figure to the plot, render its html, save and record it."""
        plot.fig = figure
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(settings)
        plots_made.append(plot)

    if plots["dot"]:
        if log:
            dot_plot = Plot(path=path + "_loglength_dot.html",
                            title=f"{names[0]} vs {names[1]} plot using dots "
                            "after log transformation of read lengths")
        else:
            dot_plot = Plot(path=path + "_dot.html",
                            title=f"{names[0]} vs {names[1]} plot using dots")

        dots = px.scatter(x=x[idx],
                          y=y[idx],
                          marginal_x="histogram",
                          marginal_y="histogram",
                          range_x=[minvalx, maxvalx],
                          range_y=[minvaly, maxvaly])
        dots.update_traces(marker=dict(color=color))
        dots.update_yaxes(rangemode="tozero")
        dots.update_xaxes(rangemode="tozero")
        dots.update_layout(xaxis_title=names[0],
                           yaxis_title=names[1],
                           title=title or dot_plot.title,
                           title_x=0.5)
        if log:
            dots.update_layout(xaxis=log_length_axis())

        publish(dot_plot, dots)

    if plots["kde"]:
        if log:
            kde_path = path + "_loglength_kde.html"
        else:
            kde_path = path + "_kde.html"
        kde_plot = Plot(path=kde_path,
                        title=f"{names[0]} vs {names[1]} kde plot")

        rgb = hex_to_rgb_scale_0_1(color)
        density = ff.create_2d_density(x[idx],
                                       y[idx],
                                       point_size=3,
                                       hist_color=rgb,
                                       point_color=rgb,
                                       colorscale=colormap)
        density.update_layout(xaxis_title=names[0],
                              yaxis_title=names[1],
                              title=title or kde_plot.title,
                              title_x=0.5,
                              xaxis=dict(tickangle=45))
        if log:
            density.update_layout(xaxis=log_length_axis())

        publish(kde_plot, density)

    if 1 in legacy.values():
        # NOTE(review): the settings used for the legacy plots are re-read
        # through utils.get_args() rather than taken from the `settings`
        # argument - confirm this is intended.
        legacy_settings, _ = utils.get_args()
        plots_made += scatter_legacy(x=x[idx],
                                     y=y[idx],
                                     names=names,
                                     path=path,
                                     plots=legacy,
                                     color=color,
                                     settings=legacy_settings,
                                     stat=stat,
                                     log=log,
                                     minvalx=minvalx,
                                     minvaly=minvaly,
                                     title=title)
    return plots_made
예제 #28
0
def plot_over_time(dfs, path, title, settings, color="#4CB391"):
    """Plot the number of reads, and of active pores, per 10 minutes of run time.

    Args:
        dfs: DataFrame with a "lengths" column and optionally a "channelIDs"
            column. Its index is resampled and converted with
            ``total_seconds()``, so it is presumably a TimedeltaIndex of
            read start times - confirm with the caller.
        path: Prefix to which the output file names are appended.
        title: Figure title; the default title of each plot is used when
            this is empty.
        settings: Passed unchanged to ``Plot.save``.
        color: Marker colour.

    Returns:
        A list with the reads-over-time ``Plot``, followed by the
        active-pores ``Plot`` when "channelIDs" is available.
    """
    def render(plot, per_bin, ylabel):
        """Draw one value-per-time-bin scatter, attach it to the plot and save it."""
        fig = px.scatter(data_frame=None,
                         x=per_bin.index.total_seconds() / 3600,
                         y=per_bin)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title=ylabel,
                          title=title or plot.title,
                          title_x=0.5)
        plot.fig = fig
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(settings)
        return plot

    num_reads = Plot(path=path + "NumberOfReads_Over_Time.html",
                     title="Number of reads over time")
    # '10min' is the non-deprecated spelling of the former '10T' alias.
    plots = [render(num_reads,
                    dfs.loc[:, "lengths"].resample('10min').count(),
                    'Number of reads per 10 minutes')]

    if "channelIDs" in dfs:
        pores_over_time = Plot(path=path + "ActivePores_Over_Time.html",
                               title="Number of active pores over time")
        # A pore counts as active in a bin when at least one read came from it.
        plots.append(render(pores_over_time,
                            dfs.loc[:, "channelIDs"].resample('10min').nunique(),
                            'Active pores per 10 minutes'))
    return plots
예제 #29
0
def cumulative_yield(dfs, path, title, color, settings):
    """Plot the cumulative yield over run time, in gigabases and in reads.

    Args:
        dfs: DataFrame with a "lengths" column. Its index is resampled per
            10 minutes and converted with ``total_seconds()``, so it is
            presumably a sorted TimedeltaIndex of read start times -
            confirm with the caller.
        path: Prefix to which the output file names are appended.
        title: Figure title; the default title of each plot is used when
            this is empty.
        color: Marker colour.
        settings: Passed unchanged to ``Plot.save``.

    Returns:
        ``[cum_yield_gb, cum_yield_reads]``: the gigabase plot followed by
        the number-of-reads plot.
    """
    def render(plot, cumulative, ylabel):
        """Draw one cumulative scatter, attach it to the plot and save it."""
        fig = px.scatter(x=cumulative.index.total_seconds() / 3600,
                         y=cumulative)
        fig.update_traces(marker=dict(color=color))
        fig.update_layout(xaxis_title='Run time (hours)',
                          yaxis_title=ylabel,
                          # Each figure falls back to the title of its own plot.
                          title=title or plot.title,
                          title_x=0.5)
        plot.fig = fig
        plot.html = plot.fig.to_html(full_html=False, include_plotlyjs='cdn')
        plot.save(settings)
        return plot

    # '10min' is the non-deprecated spelling of the former '10T' alias.
    cum_yield_gb = render(
        Plot(path=path + "CumulativeYieldPlot_Gigabases.html",
             title="Cumulative yield"),
        # Running total of bases; the last value of each bin, scaled to 1e9 bases.
        dfs.loc[:, "lengths"].cumsum().resample('10min').max() / 1e9,
        'Cumulative yield in gigabase')

    cum_yield_reads = render(
        Plot(path=path + "CumulativeYieldPlot_NumberOfReads.html",
             title="Cumulative yield"),
        # Reads per bin, accumulated over the bins.
        dfs.loc[:, "lengths"].resample('10min').count().cumsum(),
        'Cumulative yield in number of reads')

    return [cum_yield_gb, cum_yield_reads]
예제 #30
0
def length_plots(array, name, path, title=None, n50=None, color="#4CB391", figformat="png"):
    """Create read length histograms, plain and after log transformation.

    Both histograms are made twice: once counting reads and once weighted by
    read length (counting bases). When ``n50`` is given it is marked with a
    vertical line in every histogram. The plot returned by
    ``yield_by_minimal_length_plot`` is appended to the result.

    Returns the list of plots in the order histogram, log histogram,
    weighted histogram, weighted log histogram, yield by minimal length.
    """
    logging.info("Nanoplotter: Creating length plots for {}.".format(name))
    maxvalx = np.amax(array)
    if n50:
        logging.info("Nanoplotter: Using {} reads with read length N50 of {}bp and maximum of {}bp."
                     .format(array.size, n50, maxvalx))
    else:
        logging.info("Nanoplotter: Using {} reads maximum of {}bp.".format(array.size, maxvalx))

    def bar_style(weights):
        """Build a fresh keyword dict for the histogram bars of one plot."""
        return dict(weights=weights, edgecolor=color, linewidth=0.2, alpha=0.8)

    def mark_n50(axes, position):
        """Draw the N50 line at ``position`` and label it at the tallest bar."""
        plt.axvline(position)
        tallest = np.amax([bar.get_height() for bar in axes.patches])
        plt.annotate('N50', xy=(position, tallest), size=8)

    def finish(plot, axes):
        """Use plain y tick labels, then store and save the figure and close it."""
        plt.ticklabel_format(style='plain', axis='y')
        plot.fig = axes.get_figure()
        plot.save(format=figformat)
        plt.close("all")

    file_tail = name.replace(' ', '') + "." + figformat
    plots = []
    # (weights, title/file prefix, y axis label) of the two histogram flavours
    flavours = [(None, "", "Number of reads"),
                (array, "Weighted ", "Number of bases")]
    for weights, prefix, ylabel in flavours:
        file_head = path + prefix.replace(" ", "_")

        histogram = Plot(path=file_head + "Histogram" + file_tail,
                         title=prefix + "Histogram of read lengths")
        ax = sns.distplot(a=array,
                          kde=False,
                          hist=True,
                          # One bin per 500 bases, but never fewer than 10 bins.
                          bins=max(round(int(maxvalx) / 500), 10),
                          color=color,
                          hist_kws=bar_style(weights))
        if n50:
            mark_n50(ax, n50)
        ax.set(xlabel='Read length',
               ylabel=ylabel,
               title=title or histogram.title)
        finish(histogram, ax)

        log_histogram = Plot(path=file_head + "LogTransformed_Histogram" + file_tail,
                             title=prefix + "Histogram of read lengths after log transformation")
        ax = sns.distplot(a=np.log10(array),
                          kde=False,
                          hist=True,
                          color=color,
                          hist_kws=bar_style(weights))
        # Label the log10 axis with powers of ten up to ten times the maximum.
        ticks = [10**i for i in range(10) if not 10**i > 10 * maxvalx]
        ax.set(xticks=np.log10(ticks),
               xticklabels=ticks,
               xlabel='Read length',
               ylabel=ylabel,
               title=title or log_histogram.title)
        if n50:
            mark_n50(ax, np.log10(n50))
        finish(log_histogram, ax)

        plots.extend([histogram, log_histogram])

    plots.append(yield_by_minimal_length_plot(array=array,
                                              name=name,
                                              path=path,
                                              title=title,
                                              color=color,
                                              figformat=figformat))
    return plots