Exemplo n.º 1
0
def plot_cumulative_histogram(data_folder, adaID, fragment, insert_sizes,
                              title=None,
                              ax=None,
                              show=False, savefig=False,
                              **kwargs):
    '''Plot cumulative histogram of insert sizes

    Parameters:
       data_folder: passed to the filename helpers when saving
       adaID: passed to the filename helpers when saving
       fragment: passed to the output filename helper when saving
       insert_sizes: values plotted on the x axis against evenly spaced
          fractions between 0 and 1 (NOTE(review): this presumably expects
          the sizes to be sorted already -- confirm with the callers)
       title: optional title for the axes
       ax: axes to draw on; if None, a new figure with a single axes is made
       show: if True, turn on interactive mode and show the figure
       savefig: if True, save the figure to the file named by
          get_insert_size_distribution_cumulative_filename
       **kwargs: passed on to ax.plot
    '''
    import matplotlib.pyplot as plt
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # The caller owns the axes: recover their figure, otherwise `fig`
        # would be undefined when saving below.
        fig = ax.get_figure()

    ax.plot(insert_sizes, np.linspace(0, 1, len(insert_sizes)), **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Cumulative fraction')
    ax.set_xlim(-1, 1000)
    ax.set_ylim(-0.02, 1.02)
    if title is not None:
        ax.set_title(title)

    # Lay out the figure we drew on, which is not necessarily the current
    # pyplot figure when the axes were supplied by the caller.
    fig.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_cumulative_filename(data_folder,
                                                                           adaID,
                                                                           fragment)
        mkdirs(get_figure_folder(data_folder, adaID))
        fig.savefig(output_filename)
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    """Plot quality score curves for every position along the reads.

    A figure with two side-by-side panels is created and the items of
    `quality` are paired with the panels in order. Within a panel, each
    entry of that item is drawn as one curve: its values on the x axis
    against evenly spaced fractions going from 1 down to 0. Curves are
    colored along the jet colormap according to their index.

    If `savefig` is true, the figure is saved to the file given by
    get_quality_along_reads_filename (the figure folder is created first);
    otherwise the figure is shown in interactive mode. VERBOSE is not used.
    """

    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    for read_number, (panel, read_quals) in enumerate(izip(axs, quality), 1):
        n_positions = len(read_quals)
        for position, scores in enumerate(read_quals):
            fractions = np.linspace(0, 1, len(scores))[::-1]
            curve_color = cm.jet(int(255.0 * position / n_positions))
            panel.plot(scores, fractions, color=curve_color, alpha=0.5, lw=2)

        panel.set_xlabel("Phred quality", fontsize=14)
        panel.set_ylabel("Fraction of bases above quality x", fontsize=14)
        panel.set_title("Read" + str(read_number), fontsize=16)
        panel.text(2, 0.03, "blue to red: 0 to " + str(n_positions) + " base", fontsize=18)

    fig.suptitle(title, fontsize=20)

    # Not saving: just show the figure interactively and stop here
    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename

    target_folder = get_figure_folder(data_folder, adaID)
    target_filename = get_quality_along_reads_filename(data_folder, adaID)
    mkdirs(target_folder)
    fig.savefig(target_filename)
Exemplo n.º 3
0
def plot_histogram(data_folder, adaID, fragment, h,
                   title=None,
                   ax=None,
                   show=False, savefig=False,
                   **kwargs):
    '''Plot histogram of insert sizes

    Parameters:
       data_folder: passed to the filename helpers when saving
       adaID: passed to the filename helpers when saving
       fragment: passed to the output filename helper when saving
       h: pair of arrays; h[0] is plotted against the midpoints of
          consecutive entries of h[1] (e.g. the output of numpy.histogram)
       title: optional title for the axes
       ax: axes to draw on; if None, a new figure with a single axes is made
       show: if True, turn on interactive mode and show the figure
       savefig: if True, save the figure to the file named by
          get_insert_size_distribution_filename
       **kwargs: passed on to ax.plot
    '''
    import matplotlib.pyplot as plt
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Work on the figure that owns the supplied axes rather than on
        # whatever figure happens to be current in pyplot.
        fig = ax.get_figure()

    if title is not None:
        ax.set_title(title)

    # Midpoints of consecutive entries of h[1]
    x = 0.5 * (h[1][1:] + h[1][:-1])
    y = h[0]
    ax.plot(x, y, **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Density')

    fig.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_filename(data_folder, adaID,
                                                                fragment)
        mkdirs(get_figure_folder(data_folder, adaID))
        # Save the figure we drew on, not the current pyplot figure
        fig.savefig(output_filename)
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Plot the quality score curves along the reads, one panel per read

    The items of `quality` are paired in order with the two panels of a new
    figure. Every entry of an item becomes one curve (its values against
    evenly spaced fractions from 1 down to 0), colored along the jet
    colormap by its index within the item.

    With a true `savefig` the figure is written to the file given by
    get_quality_along_reads_filename, after creating the figure folder;
    otherwise it is shown in interactive mode. VERBOSE is not used.
    '''

    from matplotlib import cm
    import matplotlib.pyplot as plt

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    for iread, (axis, qual_read) in enumerate(izip(axs, quality)):
        npos = len(qual_read)

        # One curve per entry, colored by its index
        for ipos, quals in enumerate(qual_read):
            col = cm.jet(int(255.0 * ipos / npos))
            yfrac = np.linspace(0, 1, len(quals))[::-1]
            axis.plot(quals, yfrac, color=col, alpha=0.5, lw=2)

        axis.set_xlabel('Phred quality', fontsize=14)
        axis.set_ylabel('Fraction of bases above quality x', fontsize=14)
        axis.set_title('Read'+str(iread + 1), fontsize=16)
        legend_text = 'blue to red: 0 to '+str(npos)+' base'
        axis.text(2, 0.03, legend_text, fontsize=18)

    fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.sequencing.filenames import get_figure_folder, \
                get_quality_along_reads_filename
        from hivwholeseq.utils.generic import mkdirs
        out_folder = get_figure_folder(data_folder, adaID)
        out_filename = get_quality_along_reads_filename(data_folder, adaID)
        mkdirs(out_folder)
        fig.savefig(out_filename)

    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
def plot_cuts_quality_along_reads(data_folder,
                                  adaID,
                                  quality,
                                  title='',
                                  VERBOSE=0,
                                  savefig=False):
    '''Plot some cuts of the quality along the read

    For each item of `quality` (paired in order with the two panels of a new
    figure) and for each of a few fixed quality thresholds, plot against the
    index along the item the value 100 - percentileofscore(entry, threshold).

    Parameters:
       data_folder: passed to the filename helpers when savefig is True
       adaID: passed to the filename helpers when savefig is True
       quality: one item per panel; each item is a sequence whose entries
          are collections of scores, as accepted by
          scipy.stats.percentileofscore
       title: figure title, omitted if empty
       VERBOSE: not used
       savefig: False to show the figure interactively, True to save it to
          the standard location, or a string with the filename to save to

    Raises:
       ValueError: if savefig is true but neither True nor a string
    '''
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm
    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        # The x values do not depend on the threshold
        x = np.arange(len(qual))
        for j, qthresh in enumerate(qthreshs):
            y = np.array([100 - pof(qpos, qthresh) for qpos in qual])
            ax.plot(x,
                    y,
                    color=cm.jet(int(255.0 * j / len(qthreshs))),
                    alpha=0.8,
                    lw=2,
                    label='Q = ' + str(qthresh))
        ax.set_xlabel('Position [bp]', fontsize=14)
        ax.set_ylabel('Percentage of bases above quality x', fontsize=14)
        ax.set_title('Read' + str(i + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc='best')

    if title:
        fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        # NOTE: equality (not identity) is kept on purpose, so that values
        # that compare equal to True are still accepted as before.
        if savefig == True:
            from hivwholeseq.sequencing.filenames import get_figure_folder, \
                    get_quality_along_reads_filename
            fig_folder = get_figure_folder(data_folder, adaID)
            fig_filename = get_quality_along_reads_filename(data_folder,
                                                            adaID,
                                                            simple=True)
        elif isinstance(savefig, basestring):
            import os
            fig_filename = savefig
            fig_folder = os.path.dirname(fig_filename)

        else:
            raise ValueError(
                'savefig must be a bool or a figure filename (string)')

        # os.path.dirname gives '' for a bare filename: the figure goes into
        # the current directory and there is no folder to create.
        if fig_folder:
            mkdirs(fig_folder)
        fig.savefig(fig_filename)

    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
Exemplo n.º 6
0
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)

    The premapped BAM file is scanned and, for every mapped read in a proper
    pair, the reference positions spanned by its aligned blocks are counted
    (deletions count as covered). The coverage is plotted on a log scale
    (shifted by one) and saved to the coverage figure file.

    Parameters:
       data_folder: passed to the filename helpers
       adaID: passed to the filename helpers; also concatenated into the
          figure title, so it must be a string
       VERBOSE: not used
       summary: if True, append the read counts and the figure filename to
          the premap summary file
    '''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')

    # Prepare data structures
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file
    # NOTE(review): the counters are incremented once per read in the file,
    # while the summary below calls them "read pairs" -- confirm intent.
    unmapped = 0
    mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            # `not read.cigar` also covers a missing (None) CIGAR
            if read.is_unmapped or (not read.is_proper_pair) or (not read.cigar):
                unmapped += 1
                continue

            # Proceed along CIGARs. BAM operation codes:
            # 0 = M, 1 = I, 2 = D, 3 = N, 4 = S, 5 = H, 6 = P, 7 = '=', 8 = X
            ref_pos = read.pos
            for (bt, bl) in read.cigar:
                if bt in (0, 2, 7, 8):
                    # Matches/mismatches; treat deletions as 'covered'
                    coverage[ref_pos:ref_pos + bl] += 1
                    ref_pos += bl
                elif bt == 3:
                    # A skipped region is not covered, but it does consume
                    # reference: later blocks must start after it
                    ref_pos += bl
                # Everything else (inserts, clipping, padding) does not
                # consume reference and is ignored
            mapped += 1

    # Save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig_filename = get_coverage_figure_filename(data_folder, adaID, 'premapped')
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    # Shift by one so that uncovered positions can be drawn on a log scale
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    fig.savefig(fig_filename)
    plt.close(fig)

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: '+\
                    str(mapped)+' read pairs mapped, '+str(unmapped)+' unmapped\n')
            f.write('\nCoverage plotted: '+\
                    fig_filename+'\n')
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)

    The premapped BAM file is scanned and, for every mapped read in a proper
    pair, the reference positions spanned by its aligned blocks are counted
    (deletions count as covered). The coverage is plotted on a log scale
    (shifted by one) and saved to the coverage figure file.

    Parameters:
       data_folder: passed to the filename helpers
       adaID: passed to the filename helpers; also concatenated into the
          figure title, so it must be a string
       VERBOSE: not used
       summary: if True, append the read counts and the figure filename to
          the premap summary file
    '''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')

    # Prepare data structures
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file
    # NOTE(review): the counters are incremented once per read in the file,
    # while the summary below calls them "read pairs" -- confirm intent.
    unmapped = 0
    mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            # `not read.cigar` also covers a missing (None) CIGAR
            if read.is_unmapped or (not read.is_proper_pair) or (not read.cigar):
                unmapped += 1
                continue

            # Proceed along CIGARs. BAM operation codes:
            # 0 = M, 1 = I, 2 = D, 3 = N, 4 = S, 5 = H, 6 = P, 7 = '=', 8 = X
            ref_pos = read.pos
            for (bt, bl) in read.cigar:
                if bt in (0, 2, 7, 8):
                    # Matches/mismatches; treat deletions as 'covered'
                    coverage[ref_pos:ref_pos + bl] += 1
                    ref_pos += bl
                elif bt == 3:
                    # A skipped region is not covered, but it does consume
                    # reference: later blocks must start after it
                    ref_pos += bl
                # Everything else (inserts, clipping, padding) does not
                # consume reference and is ignored
            mapped += 1

    # Save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig_filename = get_coverage_figure_filename(data_folder, adaID, 'premapped')
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    # Shift by one so that uncovered positions can be drawn on a log scale
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    fig.savefig(fig_filename)
    plt.close(fig)

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: '+\
                    str(mapped)+' read pairs mapped, '+str(unmapped)+' unmapped\n')
            f.write('\nCoverage plotted: '+\
                    fig_filename+'\n')
def plot_cuts_quality_along_reads(data_folder, adaID, quality, title="", VERBOSE=0, savefig=False):
    """Plot some cuts of the quality along the read

    For each item of `quality` (paired in order with the two panels of a new
    figure) and for each of a few fixed quality thresholds, plot against the
    index along the item the value 100 - percentileofscore(entry, threshold).

    Parameters:
       data_folder: passed to the filename helpers when savefig is True
       adaID: passed to the filename helpers when savefig is True
       quality: one item per panel; each item is a sequence whose entries
          are collections of scores, as accepted by
          scipy.stats.percentileofscore
       title: figure title, omitted if empty
       VERBOSE: not used
       savefig: False to show the figure interactively, True to save it to
          the standard location, or a string with the filename to save to

    Raises:
       ValueError: if savefig is true but neither True nor a string
    """
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        # The x values do not depend on the threshold
        x = np.arange(len(qual))
        for j, qthresh in enumerate(qthreshs):
            y = np.array([100 - pof(qpos, qthresh) for qpos in qual])
            ax.plot(x, y, color=cm.jet(int(255.0 * j / len(qthreshs))), alpha=0.8, lw=2, label="Q = " + str(qthresh))
        ax.set_xlabel("Position [bp]", fontsize=14)
        ax.set_ylabel("Percentage of bases above quality x", fontsize=14)
        ax.set_title("Read" + str(i + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc="best")

    if title:
        fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.utils.generic import mkdirs

        # NOTE: equality (not identity) is kept on purpose, so that values
        # that compare equal to True are still accepted as before.
        if savefig == True:
            from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename

            fig_folder = get_figure_folder(data_folder, adaID)
            fig_filename = get_quality_along_reads_filename(data_folder, adaID, simple=True)
        elif isinstance(savefig, basestring):
            import os

            fig_filename = savefig
            fig_folder = os.path.dirname(fig_filename)

        else:
            raise ValueError("savefig must be a bool or a figure filename (string)")

        # os.path.dirname gives "" for a bare filename: the figure goes into
        # the current directory and there is no folder to create.
        if fig_folder:
            mkdirs(fig_folder)
        fig.savefig(fig_filename)

    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
def plot_cuts_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Plot quality threshold cuts along the reads, one panel per read

    The items of `quality` are paired in order with the two panels of a new
    figure. For each of a few fixed thresholds, a curve is drawn showing
    100 - percentileofscore(entry, threshold) against the index of the entry
    within the item.

    With a true `savefig` the figure is written to the file given by
    get_quality_along_reads_filename(..., simple=True), after creating the
    figure folder; otherwise it is shown in interactive mode.
    VERBOSE is not used.
    '''
    import matplotlib.pyplot as plt
    from matplotlib import cm
    from scipy.stats import percentileofscore as pof

    thresholds = [10, 20, 30, 35]
    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    for iread, (axis, qual_read) in enumerate(izip(axs, quality)):
        for ithresh, threshold in enumerate(thresholds):
            positions = np.arange(len(qual_read))

            # Value of the cut at each index along the item
            percentages = []
            for ipos in xrange(len(qual_read)):
                percentages.append(100 - pof(qual_read[ipos], threshold))
            percentages = np.array(percentages)

            line_color = cm.jet(int(255.0 * ithresh / len(thresholds)))
            axis.plot(positions, percentages, color=line_color,
                      alpha=0.8,
                      lw=2,
                      label='Q = '+str(threshold))

        axis.set_xlabel('Position [bp]', fontsize=14)
        axis.set_ylabel('Percentage of bases above quality x', fontsize=14)
        axis.set_title('Read'+str(iread + 1), fontsize=16)
        axis.set_ylim(-1, 101)
        axis.set_xlim(-1, len(qual_read) + 1)
        axis.legend(loc='best')

    fig.suptitle(title, fontsize=20)

    # Not saving: just show the figure interactively and stop here
    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, \
            get_quality_along_reads_filename
    out_folder = get_figure_folder(data_folder, adaID)
    out_filename = get_quality_along_reads_filename(data_folder, adaID, simple=True)
    mkdirs(out_folder)
    fig.savefig(out_filename)