Exemplo n.º 1
0
def quality_plot(fnam,
                 r_enz=None,
                 nreads=float('inf'),
                 axe=None,
                 savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
    )])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    for k in RESTRICTION_ENZYMES.keys():
        for i in range(len(r_enzs)):
            if k.lower() == str(r_enz[i]).lower():
                r_enz[i] = k
    # else let it as None

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if len(r_enzs) == 1 and r_enzs[0] is None:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0  # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend(
                    [m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend(
                    [m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each paired of cut-site
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if len(r_enzs) == 1 and r_enzs[0] is None:  # do both plots
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # only do the quality_plot plot
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False)
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False,
                       which='minor')

    ax.errorbar(range(max_seq_len),
                meanquals,
                linewidth=1,
                elinewidth=1,
                color='darkblue',
                yerr=errorquals,
                ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in range(max_seq_len)],
             linewidth=1,
             color='black',
             linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not (len(r_enzs) == 1 and r_enzs[0] is None):
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' %
                     (', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k)
                            for k in range(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k)
                            for k in range(seq_len)]  # DE
        for r1, r2 in liges:
            liges[(r1,
                   r2)] = [liges[(r1, r2)].count(k)
                           for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] + [
                    fixes[r_enz][k] - sites[r_enz][k - pos]
                    for k in range(pos, seq_len)
                ])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] + [
                    fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                    for k in range(pos, seq_len)
                ])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
                fixes[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(
                sites[r_enz],
                linewidth=2,
                color=color.next(),
                alpha=0.9,
                label='Undigested RE site (%s: %s)' %
                (r_enz, r_sites[r_enz]) if any([f > 0 for f in fixes[r_enz]])
                else 'Undigested & Dangling-Ends (%s: %s)' %
                (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)],
                     linewidth=2,
                     color=color.next(),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' %
                     (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz],
                         linewidth=2,
                         color=color.next(),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' %
                         (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len // 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len // 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = (
                    (100. *
                     (fixes[r_enz][0] +
                      (fixes[r_enz][(max_seq_len // 2)] if paired else 0))) //
                    nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][
                    (max_seq_len // 2)] if paired else 0))) // nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(
                sum([
                    lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                    for r_enz1 in r_enzs for r_enz2 in r_enzs
                    if r_enz1 == r_enz or r_enz2 == r_enz
                ]))
            title += (
                'Percentage of digested sites (not considering Dangling-Ends) '
                '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) /
                                  (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz,
                                                                   des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += (
                    'Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                    % (r_enz1, r_enz2,
                       (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines,
                   labels,
                   bbox_to_anchor=(0.75, 1.0),
                   loc=3,
                   borderaxespad=0.,
                   frameon=False,
                   fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if len(r_enzs) == 1 and r_enzs[0] is None:
        return {}, {}
    return des, ligep
Exemplo n.º 2
0
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)        
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)]
                                            if paired else 0)))
                       / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
Exemplo n.º 3
0
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)        
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)]
                                            if paired else 0)))
                       / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
Exemplo n.º 4
0
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restrinction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site, is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necesary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: is input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    for k in RESTRICTION_ENZYMES.keys():
        for i in range(len(r_enzs)):
            if k.lower() == r_enz[i].lower():
                r_enz[i] = k
    # else let it as None

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enzs:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except EOFError:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0   # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend([m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend([m.start() for m in fixe[r_enz].finditer(seq)])
            for k  in lige:  # for each paired of cut-site
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enz:  # do both plots
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in xrange(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                                   max([len(l_sites[k]) for k in l_sites]),
                                   max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k) for k in xrange(seq_len)] # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k) for k in xrange(seq_len)] # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k) for k in xrange(seq_len)] # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k-pos]
                                 for k in xrange(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                                  for k in xrange(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
                fixes[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(sites[r_enz], linewidth=2, color = color.next(),
                     alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (r_enz, r_sites[r_enz])
                     if any([f > 0 for f in fixes[r_enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)], linewidth=2, color=color.next(),
                     alpha=0.9,
                     label = 'Ligated (%s-%s: %s)' % (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz], linewidth=2, color=color.next(),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                              liges[k][max_seq_len / 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len / 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = ((100. * (fixes[r_enz][0] + (fixes[r_enz][(max_seq_len / 2)]
                                                          if paired else 0))) / nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][(max_seq_len / 2)]
                                                         if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering Dangling-Ends) '
                      '%s: %.1f%%\n' % (r_enz,
                                        100. * float(lcnt) / (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz, des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): %.1f%% \n' %
                          (r_enz1, r_enz2, (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep