Пример #1
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    Leading lines of the input file that start with '#' are skipped as header.
    Every following line is split on tabs and fields 2 to 12 are read as
    (crm1, pos1, dir1, _, re1, _, crm2, pos2, dir2, _, re2). A pair is counted
    as a dangling-end when both ends have the same ``re`` and ``crm`` values,
    different ``dir`` values, and ``pos2 > pos1`` agrees with ``int(dir1)``;
    its length is ``abs(pos2 - pos1)``.

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: if set, stop reading the input file as soon as
       ``nreads // 2`` dangling-ends have been collected.
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: passed to ``setup_plot``; when given (and no *savefig*),
       the figure is not shown interactively. Presumably an existing
       matplotlib axes to draw on -- confirm against ``setup_plot``.
    :param False xlog: represent x axis in logarithmic scale

    :returns: the median value and the percentile inputed as max_size.

    :raises ValueError: if no dangling-end is found in the input file.
    """
    if nreads:
        # floor division keeps the limit an integer on Python 2 and 3, so that
        # the equality test against len(des) below can actually be met
        nreads //= 2
    des = []
    # the context manager closes the file even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading block of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
    if not des:
        # fail early with a clear message, before any figure is created
        raise ValueError('no dangling-ends found in %s' % str(fnam))
    # all percentiles in a single call
    (perc01, perc05, perc50,
     perc95, perc99, max_perc) = np.percentile(
         des, [1, 5, 50, 95, 99, max_size])
    ax = setup_plot(axe, figsize=(10, 5.5))
    # the two outer bands (1-5% and 95-99%) share one colour and one legend
    # entry; the inner band (5-95%) gets its own
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend drawn outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return perc50, max_perc
Пример #2
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max')):
    """
    Plots the distribution of dangling-ends lengths

    Leading lines of the input file that start with '#' are skipped as header.
    Every following line is split on tabs and fields 2 to 12 are read as
    (crm1, pos1, dir1, _, re1, _, crm2, pos2, dir2, _, re2). A pair is counted
    as a dangling-end when both ends have the same ``re`` and ``crm`` values,
    different ``dir`` values, and ``pos2 > pos1`` agrees with ``int(dir1)``;
    its length is ``abs(pos2 - pos1)``.

    Nothing is plotted when none of *savefig*, *axe* and *show* is set.

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: if set, stop reading the input file as soon as
       ``nreads // 2`` dangling-ends have been collected.
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: passed to ``setup_plot``; when given, the figure is never
       shown interactively. Presumably an existing matplotlib axes to draw
       on -- confirm against ``setup_plot``.
    :param False show: display the plot interactively (only when neither
       *savefig* nor *axe* is given).
    :param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: returns this set of values calculated
       from the distribution of insert/fragment sizes. Only the requested
       statistics are computed. Possible values are:
        - 'median' median of the distribution
        - 'perc_max' percentile defined by the other parameter 'max_size'
        - 'first_decay' starting from the median of the distribution to the
            first window where 10 consecutive insert sizes are counted less
            than a given value (this given value is equal to the number of
            insert sizes divided by 100 000)
        - 'MAD' Double Median Adjusted Deviation (as computed by ``mad``)

    :returns: a list with the values of the statistics named in *stats*, in
       the same order.

    :raises ValueError: if no dangling-end is found in the input file.
    :raises Exception: if 'first_decay' is requested and no such window is
       found between the median and the largest insert size.
    :raises KeyError: if *stats* names an unknown statistic.
    """
    # function-scope import: the file's import header is not guaranteed to
    # provide Counter
    from collections import Counter

    if nreads:
        # floor division keeps the limit an integer on Python 2 and 3, so that
        # the equality test against len(des) below can actually be met
        nreads //= 2
    des = []
    # the context manager closes the file even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading block of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
    if not des:
        # fail early with a clear message instead of deep inside numpy/max()
        raise ValueError('no dangling-ends found in %s' % str(fnam))
    # all percentiles in a single call
    (perc01, perc05, perc50,
     perc95, perc99, max_perc) = np.percentile(
         des, [1, 5, 50, 95, 99, max_size])
    to_return = {'median': perc50, 'perc_max': max_perc}
    if 'first_decay' in stats:
        cutoff = len(des) / 100000.
        # count every size once instead of calling des.count() per candidate
        counts = Counter(des)
        below = 0
        size = int(perc50)
        last = int(max(des))
        # a while loop avoids both Python 2's xrange and a huge range() list
        while size < last:
            if counts[size] < cutoff:
                below += 1
            else:
                below = 0
            if below >= 10:
                to_return['first_decay'] = size - 10
                break
            size += 1
        else:
            # reached only when the scan ends without finding the window
            raise Exception('ERROR: not found')
    if 'MAD' in stats:
        to_return['MAD'] = mad(des)
    if not savefig and not axe and not show:
        return [to_return[k] for k in stats]

    ax = setup_plot(axe, figsize=(10, 5.5))
    # the two outer bands (1-5% and 95-99%) share one colour and one legend
    # entry; the inner band (5-95%) gets its own
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend drawn outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show and not axe:
        plt.show()
    plt.close('all')
    return [to_return[k] for k in stats]
Пример #3
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    Leading lines of the input file that start with '#' are skipped as header.
    Every following line is split on tabs and fields 2 to 12 are read as
    (crm1, pos1, dir1, _, re1, _, crm2, pos2, dir2, _, re2). A pair is counted
    as a dangling-end when both ends have the same ``re`` and ``crm`` values,
    different ``dir`` values, and ``pos2 > pos1`` agrees with ``int(dir1)``;
    its length is ``abs(pos2 - pos1)``.

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: if set, stop reading the input file as soon as
       ``nreads // 2`` dangling-ends have been collected.
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: passed to ``setup_plot``; when given (and no *savefig*),
       the figure is not shown interactively. Presumably an existing
       matplotlib axes to draw on -- confirm against ``setup_plot``.
    :param False xlog: represent x axis in logarithmic scale

    :raises ValueError: if no dangling-end is found in the input file.
    """
    if nreads:
        # floor division keeps the limit an integer on Python 2 and 3, so that
        # the equality test against len(des) below can actually be met
        nreads //= 2
    des = []
    # the context manager closes the file even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading block of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
    if not des:
        # fail early with a clear message, before any figure is created
        raise ValueError('no dangling-ends found in %s' % str(fnam))
    # all percentiles in a single call
    perc01, perc05, perc95, perc99, max_perc = np.percentile(
        des, [1, 5, 95, 99, max_size])
    ax = setup_plot(axe, figsize=(10, 5.5))
    # the two outer bands (1-5% and 95-99%) share one colour and one legend
    # entry; the inner band (5-95%) gets its own
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc),
            alpha=.7, color='darkred', label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(top %.1f%%, up to %0.f nts)' % (max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend drawn outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')