def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    A pair of reads is counted as a dangling-end when both reads share the
    same restriction-fragment field and the same chromosome, have different
    strand fields, and the orientation test on their positions passes.

    :param fnam: input file name (tab-separated, with an optional header of
       lines starting with '#')
    :param None savefig: path where to store the output images.
    :param None nreads: if set, parsing stops once ``nreads // 2``
       dangling-ends have been collected (NOTE(review): presumably because
       each dangling-end is made of two reads -- confirm).
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: handed over to ``setup_plot``; when set, the plot is not
       shown interactively.
    :param False xlog: represent x axis in logarithmic scale

    :returns: the median value and the percentile inputed as max_size.

    :raises ValueError: if no dangling-end is found in the input file.
    """
    # floor division keeps the Python 2 integer semantics under Python 3 too
    max_des = nreads // 2 if nreads else None
    des = []
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading run of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
                if max_des and len(des) >= max_des:
                    break
    if not des:
        raise ValueError('ERROR: no dangling-ends found in %s' % fnam)

    ax = setup_plot(axe, figsize=(10, 5.5))
    max_perc = np.percentile(des, max_size)
    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc50 = np.percentile(des, 50)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)
    # the 1-99% band is drawn as two spans on each side of the 5-95% band;
    # only the first one carries the legend label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc), alpha=.7, color='darkred',
            label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend placed outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    # closes every open figure, whatever branch was taken above
    plt.close('all')
    return perc50, max_perc
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max')):
    """
    Plots the distribution of dangling-ends lengths

    A pair of reads is counted as a dangling-end when both reads share the
    same restriction-fragment field and the same chromosome, have different
    strand fields, and the orientation test on their positions passes.

    :param fnam: input file name (tab-separated, with an optional header of
       lines starting with '#')
    :param None savefig: path where to store the output images.
    :param None nreads: if set, parsing stops once ``nreads // 2``
       dangling-ends have been collected (NOTE(review): presumably because
       each dangling-end is made of two reads -- confirm).
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: handed over to ``setup_plot``; when set, the plot is not
       shown interactively.
    :param False show: display the plot (only when neither savefig nor axe is
       given). If savefig, axe and show are all unset, nothing is plotted.
    :param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: returns this set of values calculated
       from the distribution of insert/fragment sizes. Only the requested
       values are computed. Possible values are:
        - 'median' median of the distribution
        - 'perc_max' percentil defined by the other parameter 'max_size'
        - 'first_decay' starting from the median of the distribution to the
          first window where 10 consecutive insert sizes are counted less
          than a given value (this given value is equal to the number of
          dangling-ends divided by 100 000)
        - 'MAD' Double Median Adjusted Deviation (as computed by ``mad``)

    :returns: a list with the requested statistics, in the order given in
       stats.

    :raises ValueError: if no dangling-end is found in the input file.
    :raises Exception: if 'first_decay' is requested and no window of 10
       consecutive rare sizes exists between the median and the maximum.
    """
    # materialise once: stats is inspected and then iterated
    stats = tuple(stats)
    # floor division keeps the Python 2 integer semantics under Python 3 too
    max_des = nreads // 2 if nreads else None
    des = []
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading run of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
                if max_des and len(des) >= max_des:
                    break
    if not des:
        raise ValueError('ERROR: no dangling-ends found in %s' % fnam)

    max_perc = np.percentile(des, max_size)
    perc50 = np.percentile(des, 50)

    to_return = {'median': perc50, 'perc_max': max_perc}
    if 'first_decay' in stats:
        # tally each size once instead of calling des.count() per candidate
        counts = {}
        for size in des:
            counts[size] = counts.get(size, 0) + 1
        cutoff = len(des) / 100000.
        run = 0
        size = int(perc50)
        last = int(max(des))
        # explicit while loop: no large list is built under Python 2
        while size < last:
            if counts.get(size, 0) < cutoff:
                run += 1
            else:
                run = 0
            if run >= 10:
                to_return['first_decay'] = size - 10
                break
            size += 1
        else:
            raise Exception('ERROR: not found')
    if 'MAD' in stats:
        to_return['MAD'] = mad(des)
    result = [to_return[k] for k in stats]

    if not savefig and not axe and not show:
        return result

    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)
    ax = setup_plot(axe, figsize=(10, 5.5))
    # the 1-99% band is drawn as two spans on each side of the 5-95% band;
    # only the first one carries the legend label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc), alpha=.7, color='darkred',
            label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(median: %s, top %.1f%%, up to %0.f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend placed outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show and not axe:
        plt.show()
    # closes every open figure, whatever branch was taken above
    plt.close('all')
    return result
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-ends lengths

    A pair of reads is counted as a dangling-end when both reads share the
    same restriction-fragment field and the same chromosome, have different
    strand fields, and the orientation test on their positions passes.

    :param fnam: input file name (tab-separated, with an optional header of
       lines starting with '#')
    :param None savefig: path where to store the output images.
    :param None nreads: if set, parsing stops once ``nreads // 2``
       dangling-ends have been collected (NOTE(review): presumably because
       each dangling-end is made of two reads -- confirm).
    :param 99.9 max_size: top percentage of distances to consider, within the
       top 0.01% are usually found very long outliers.
    :param None axe: handed over to ``setup_plot``; when set, the plot is not
       shown interactively.
    :param False xlog: represent x axis in logarithmic scale

    :raises ValueError: if no dangling-end is found in the input file.
    """
    # floor division keeps the Python 2 integer semantics under Python 3 too
    max_des = nreads // 2 if nreads else None
    des = []
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                # only the leading run of '#' lines is treated as header
                if line.startswith('#'):
                    continue
                in_header = False
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1 != re2 or crm1 != crm2 or dir1 == dir2:
                continue
            pos1, pos2 = int(pos1), int(pos2)
            if (pos2 > pos1) == int(dir1):
                des.append(abs(pos2 - pos1))
                if max_des and len(des) >= max_des:
                    break
    if not des:
        raise ValueError('ERROR: no dangling-ends found in %s' % fnam)

    ax = setup_plot(axe, figsize=(10, 5.5))
    max_perc = np.percentile(des, max_size)
    perc99 = np.percentile(des, 99)
    perc01 = np.percentile(des, 1)
    perc95 = np.percentile(des, 95)
    perc05 = np.percentile(des, 5)
    # the 1-99% band is drawn as two spans on each side of the 5-95% band;
    # only the first one carries the legend label
    ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
               label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
               label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    ax.hist(des, bins=100, range=(0, max_perc), alpha=.7, color='darkred',
            label='Dangling-ends')
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lenghts\n(top %.1f%%, up to %0.f nts)' % (max_size,
                                                           max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    # leave room on the right for the legend placed outside of the axes
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    # closes every open figure, whatever branch was taken above
    plt.close('all')