def main():
    """Normalize a full-genome Hi-C contact map from a BAM file.

    Reads command-line options, computes per-bin biases, the distance-decay
    profile and the set of filtered-out (bad) columns via ``read_bam``, then
    pickles the results into ``<outdir>/biases_<resolution>.pickle``.
    """
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1  # target average value of the normalized matrix
    outdir = opts.outdir
    sigma = 2   # smoothing parameter forwarded to the filtering step
    mkdir(outdir)
    sys.stdout.write('\nNormalization of full genome\n')
    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus,
                                     sigma=sigma, factor=factor,
                                     outdir=outdir, check_sum=opts.check_sum)
    printime(' - Saving biases and badcol columns')
    # pickle in binary mode ('wb' rather than 'w'): text mode corrupts
    # pickles on Windows and fails under Python 3; the import tool in this
    # file already uses 'wb'
    fname = os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', '')))
    with open(fname, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': resolution}, out)
    printime('\nDone.')
def main():
    """Write a sorted genome-wide interaction matrix as a TSV file.

    The ``window`` option restricts which interactions are kept: 'inter'
    (between chromosomes), 'intra' (within a chromosome), 'all', or an
    explicit genomic span such as '100000-2000000', converted here into a
    range of bin distances.
    """
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.resolution
    outfile = opts.outfile
    biases_file = opts.biases_file
    window = opts.window
    if window not in ['inter', 'intra', 'all']:
        bounds = window.split('-')
        # validate before converting: a malformed value (e.g. '100000' with
        # no dash) previously died with an IndexError instead of a message
        if len(bounds) != 2:
            raise Exception('ERROR: window should be "inter", "intra", "all"'
                            ' or a range such as "100000-2000000"')
        window = [int(x) // resolution for x in bounds]
        if window[0] >= window[1]:
            raise Exception('ERROR: beginning of window should be smaller '
                            'than end')
    nheader = write_matrix(inbam, resolution, biases_file, outfile,
                           nchunks=opts.nchunks, ncpus=opts.ncpus,
                           clean=opts.clean, window=window)
    rand_hash = "%016x" % getrandbits(64)
    tmpdir = os.path.join('.', '_tmp_%s' % (rand_hash))
    mkdir(tmpdir)
    # sort all files for only read once per pair of peaks to extract
    try:
        sort_BAMtsv(nheader, outfile, tmpdir)
    finally:
        # remove the scratch directory even if the sort fails
        os.system('rm -rf {}'.format(tmpdir))
    printime('Done.')
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None,
                 tmpdir='.', ncpus=8, verbose=True):
    """Dump a normalized contact matrix, one cell per line, to a TSV file.

    Each written line is ``pos1 pos2 raw normalized`` where positions are
    genomic bin indices (section offset of ``region1`` added) and the
    normalized value is raw / bias1 / bias2 / decay.

    :param inbam: path to an indexed BAM file
    :param resolution: bin size in nucleotides
    :param biases: biases object accepted by ``get_biases_region`` (required)
    :param outdir: directory where the TSV is written
    :param filter_exclude: read filters to exclude (tuple of ints or bitmask)
    :raises Exception: if no biases are given (normalized values are needed)
    """
    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)
    # fail early: without biases the per-cell normalization below would hit
    # an undefined name (previously a NameError deep inside the write loop)
    if not biases:
        raise Exception('ERROR: biases are needed to write the normalized '
                        'matrix')
    regions, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, verbose=verbose)
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    bamfile.close()  # previously left open
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bias1, bias2, decay, bads1, bads2 = get_biases_region(biases, bin_coords)
    if verbose:
        printime(' - Writing matrices')
    # file name only (the original prepended `outdir` here AND passed the
    # result through os.path.join(outdir, ...), duplicating the directory)
    fnam = '{}_mat_{}kb.tsv'.format(region1, resolution / 1000)
    mkdir(outdir)
    out = open(os.path.join(outdir, fnam), 'w')
    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                         verbose=verbose, clean=clean):
        if k < j:  # we are only going to keep half of the matrix
            continue
        if j not in bads1 and k not in bads2 and abs(j - k) in decay[c]:
            n = v / bias1[j] / bias2[k] / decay[c][abs(j - k)]
            pos1 = j + section_pos[region1][0]
            pos2 = k + section_pos[region1][0]
            out.write('{}\t{}\t{}\t{}\n'.format(pos1, pos2, v, n))
    out.close()
    # this is the last thing we do in case something goes wrong
    os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' % (rand_hash))))
    if verbose:
        printime('\nDone.')
def sort_BAMtsv(nheader, outfile, tmp):
    """Numerically sort the body of a matrix TSV in place, keeping its
    first ``nheader`` header lines untouched at the top.

    :param nheader: number of header lines to preserve
    :param outfile: path of the TSV, rewritten in place
    :param tmp: directory handed to ``sort -T`` for its temporary files
    """
    tsv = outfile
    printime('Sorting BAM matrix: {}'.format(tsv))
    # sort file first and second column and write to same file
    # build the command once so the debug print shows exactly what runs
    # (previously the printed command differed: it used `tail -n +{nheader}`
    # and redirected the output onto the input file itself)
    cmd = ("(head -n {0} {2} && tail -n +{1} {2} | "
           "sort -k1n -k2n -S 10% -T {3}) > {2}_").format(
        nheader, nheader + 1, tsv, tmp)
    print(cmd)
    _ = Popen(cmd, shell=True).communicate()
    os.system("mv {0}_ {0}".format(tsv))
def main():
    """Normalize a full-genome Hi-C contact map from a BAM file.

    Computes per-bin biases, the distance-decay profile and the filtered-out
    (bad) columns via ``read_bam`` and pickles them into
    ``<outdir>/biases_<resolution>.pickle``.
    """
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1  # target average value of the normalized matrix
    outdir = opts.outdir
    sigma = 2   # smoothing parameter forwarded to the filtering step
    mkdir(outdir)
    sys.stdout.write('\nNormalization of full genome\n')
    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus,
                                     sigma=sigma, factor=factor,
                                     outdir=outdir, check_sum=opts.check_sum)
    printime(' - Saving biases and badcol columns')
    # binary mode ('wb' rather than 'w'): text-mode pickles are corrupted on
    # Windows and fail under Python 3; matches the 'wb' used by the importer
    fname = os.path.join(outdir, 'biases_%s.pickle' %
                         (nicer(resolution).replace(' ', '')))
    with open(fname, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': resolution}, out)
    printime('\nDone.')
def main():
    """Command-line entry point: build a genome-wide ("huge") interaction
    matrix from a BAM file through ``write_big_matrix``.

    NOTE(review): ``nheader`` and ``outfile`` are captured but never used in
    this view -- either the function was truncated upstream or the values are
    deliberately discarded; confirm against the full file.
    """
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.resolution
    outdir = opts.outdir
    tmppath = opts.tmppath
    biases_file = opts.biases_file
    dry_run = opts.dry_run
    # a bit of hardcoded parameter never hurts
    metric = 'loop'
    printime('Generating huge matrix')
    # write_big_matrix is a project helper (not visible here); presumably it
    # returns the number of header lines and the output path -- TODO confirm
    nheader, outfile = write_big_matrix(inbam, resolution, biases_file,
                                        outdir, nchunks=opts.nchunks,
                                        wanted_chrom=opts.chrom,
                                        wanted_pos1=opts.pos1,
                                        wanted_pos2=opts.pos2,
                                        dry_run=dry_run, ncpus=opts.ncpus,
                                        tmpdir=tmppath,
                                        clean=not opts.dirty,
                                        verbose=opts.verbose,
                                        waffle_radii=opts.waffle_radii,
                                        metric=metric)
def run(opts):
    """Compare and merge two Hi-C experiments (BAM files).

    Optionally computes correlation/reproducibility metrics between the two
    samples, then merges the BAMs with samtools, indexes the result and
    records everything in the job database.
    """
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)
    reso1 = reso2 = None
    # resolve first sample: either an explicit BAM or a workdir database entry
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None
    # resolve second sample
    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            # fixed copy-paste bug: this branch previously reset biases1
            biases2 = None
    filter_exclude = opts.filter
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')
    mkdir(path.join(opts.workdir, '00_merge'))
    if not opts.skip_comparison:
        printime(' - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(
            mreads1, opts.reso, biases=biases1,
            tmpdir=path.join(opts.workdir, '00_merge'),
            ncpus=opts.cpus, filter_exclude=filter_exclude)
        printime(' - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(
            mreads2, opts.reso, biases=biases2,
            tmpdir=path.join(opts.workdir, '00_merge'),
            ncpus=opts.cpus, filter_exclude=filter_exclude)
        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}
        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        printime(' - comparing experiments')
        printime(' => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print(' - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime(' => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime(' => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print(' - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        # comparison skipped: store placeholder values in the database
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}
        corr = eig_corr = scc = std = reprod = 0
        bads = {}
    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))
    if not opts.skip_merge:
        printime(' - Mergeing experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                     mreads1, mreads2))
        printime(' - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion(
            [l.split()[1]
             for l in Popen(samtools, stderr=PIPE,
                            universal_newlines=True).communicate()[1].split('\n')
             if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''
    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(list(bads.keys())), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
def run(opts):
    """Extract, write and/or plot Hi-C sub-matrices from a BAM file.

    Parses the region coordinates given on the command line, pulls the
    requested matrices (raw / normalized / decay-corrected), writes them as
    text files and/or renders them with matplotlib, and records the produced
    files in the job database.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])
    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None
    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        # fixed copy-paste bug: this branch previously reset vmin/vmax,
        # silently discarding an already-parsed --zrange
        opts.figsize = None
    clean = True  # change for debug
    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases
    coord1 = opts.coord1
    coord2 = opts.coord2
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1
    # parse "chrom:beg-end" coordinates; a bare chromosome name is accepted
    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None
    # cap matrix size for plotting unless explicitly forced
    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None
    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)
    # progress banner (original code had a redundant doubled `if region1:`)
    if region1:
        if not opts.quiet:
            stdout.write('\nExtraction of %s' % (region1))
        if start1:
            if not opts.quiet:
                stdout.write(':%s-%s' % (start1, end1))
        else:
            if not opts.quiet:
                stdout.write(' (full chromosome)')
        if region2:
            if not opts.quiet:
                stdout.write(' intersection with %s' % (region2))
            if start2:
                if not opts.quiet:
                    stdout.write(':%s-%s\n' % (start2, end2))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)\n')
        else:
            if not opts.quiet:
                stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')
    out_files = {}
    out_plots = {}
    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm, region1=region1, start1=start1,
                    end1=end1, region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = (
                    (reg, p + 1, p + opts.reso)
                    for r, reg in enumerate(regions)
                    for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                   ends[r] if r < len(ends) and ends[r] else sections[reg],
                                   opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join(
                        '%s\t%d\t%d\t' % (row_names.next()) +
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join(
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash),
                    '_tri' if opts.triangular else '', opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict(
                    (k, section_pos[k]) for k in section_pos if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular, vmin=vmin, vmax=vmax,
                    cmap=opts.cmap, figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))
    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)
    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def write_matrix(inbam, resolution, biases, outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None, nchunks=100,
                 tmpdir='.', ncpus=8, verbose=True, window=None):
    """Write a sparse genome-wide matrix (one cell per line) as a TSV file.

    Header lines describe chromosomes, resolution and bad columns; each body
    line is ``bin1 bin2 raw normalized``. Only the upper triangle is kept.

    :param biases: biases object for ``get_biases_region``; if falsy the
        normalized value equals the raw count
    :param window: 'all' (or None), 'intra', 'inter', or a ``(min, max)``
        pair of bin distances restricting which cells are written
    :returns: the number of header lines written (needed by the sorter)
    """
    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)
    _, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(biases,
                                                              bin_coords)
        transform = lambda x, c, j, k: x / bias1[j] / bias2[k] / decay[c][abs(k - j)]
        transform2 = lambda x, j, k: x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}
        # identity transforms; transform2 previously shared a 4-argument
        # lambda although it is always called with 3 arguments, which made
        # every bias-less run crash with a TypeError
        transform = lambda x, c, j, k: x
        transform2 = lambda x, j, k: x
    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)
    if verbose:
        printime(' - Writing matrices')
    mkdir(os.path.split(os.path.abspath(outfile))[0])
    # write the rest of the file to be sorted
    out = open(outfile, 'w')
    nheader = 0
    for i, c in enumerate(bamfile.references):
        out.write('# CHROM\t{}\t{}\n'.format(c, bamfile.lengths[i]))
        nheader += 1
    out.write('# RESOLUTION\t{}\n'.format(resolution))
    nheader += 1
    out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
    nheader += 1
    bamfile.close()  # previously left open
    # window=None now behaves like 'all' (it previously crashed trying to
    # unpack None into a pair of distance bounds)
    if window == 'all' or window is None:
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_
    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                         verbose=verbose, clean=clean):
        if k < j or j in badcols or k in badcols:  # we keep only half matrix
            continue
        if outside(c, j, k):
            continue
        try:
            n = transform(v, c, j, k)  # normalize
        except KeyError:
            n = transform2(v, j, k)  # normalize no decay
        out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))
    out.close()
    # this is the last thing we do in case something goes wrong
    if clean:
        os.system('rm -rf %s' % (os.path.join(tmpdir,
                                              '_tmp_%s' % (rand_hash))))
    return nheader
def read_bam(inbam, filter_exclude, resolution, min_count=2500, sigma=2,
             ncpus=8, factor=1, outdir='.', check_sum=False):
    """Parse a BAM file in parallel chunks and compute normalization data.

    Splits the genome into ~100 chunks, parses each with ``read_bam_frag``
    in a multiprocessing pool, collects per-bin cis-interaction counts,
    filters out bad columns, rescales biases so the normalized matrix
    averages ``factor``, and computes the per-distance decay profile.

    Returns a tuple ``(biases, sumdec, badcol)``: per-bin bias dict,
    per-distance decay dict, and the dict of filtered-out columns.

    NOTE(review): Python 2 only (print statements, xrange, integer '/').
    """
    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins per chromosome (integer division, Python 2 semantics)
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])
    start_bin = 0
    end_bin = len(bins) + 1
    # NOTE(review): `total` is immediately reassigned on the next line, so
    # this first assignment is dead
    total = len(bins)
    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    # aim for ~100 jobs regardless of genome size
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk spans a chromosome boundary: emit two windows
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('\n - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            # dotted progress display: one dot per chunk, grouped by 10/50
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' % (countbin, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()
        # each worker pickled its per-bin counts to a temporary file
        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        tmp_cisprc = load(open(fname))
        cisprc.update(tmp_cisprc)
    if verbose:
        print '%s %9s\n' % (' ' * (54 - (countbin % 50) - (countbin % 50) / 10),
                            '%s/%s' % (len(regs), len(regs)))
    # out = open(os.path.join(outdir, 'dicos_%s.pickle' % (
    #     nicer(resolution).replace(' ', ''))), 'w')
    # dump(cisprc, out)
    # out.close()
    # bad columns
    def func_gen(x, *args):
        # NOTE(review): `func_restring` is not defined anywhere in this view
        # and `func_gen` is never called here -- presumably dead code or a
        # leftover from an older fitting routine; verify before relying on it
        cmd = "zzz = " + func_restring % (args)
        exec(cmd) in globals(), locals()
        #print cmd
        try:
            return np.lib.asarray_chkfinite(zzz)
        except:
            # avoid the creation of NaNs when invalid values for power or log
            return x
    print ' - Removing columns with too few or too much interactions'
    if not min_count:
        # NOTE(review): `outdir + '...'` inside os.path.join looks like a
        # missing comma -- the PNG name is concatenated to outdir without a
        # path separator; confirm intended behavior
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True,
            savefig=os.path.join(outdir + 'filtered_bins_%s.png' % (
                nicer(resolution).replace(' ', ''))))
    else:
        print ' -> too few interactions defined as less than %9d interactions' % (
            min_count)
        # keep only the raw interaction count (index 1) per bin
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print ' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100)
    printime(' - Rescaling biases')
    size = len(bins)
    # missing bins default to a neutral bias of 1.0
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])
    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases, )))
    pool.close()
    print_progress(procs)
    pool.join()
    # to correct biases
    sumnrm = sum(p.get() for p in procs)
    # scale biases so the normalized matrix sums to size*size*factor
    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])
    # check the sum
    if check_sum:
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = os.path.join(outdir,
                                 'tmp_%s:%d-%d.pickle' % (region, start, end))
            procs.append(pool.apply_async(sum_nrm_matrix,
                                          args=(fname, biases, )))
        pool.close()
        print_progress(procs)
        pool.join()
        # to correct biases
        sumnrm = sum(p.get() for p in procs)
        print 'SUM:', sumnrm
    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()
    # collect results
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k] = v
    # count the number of cells per diagonal
    # TODO: parallelize
    # find larget chromsome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # chr_sizeerent behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)
    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass
    return biases, sumdec, badcol
def run(opts):
    """Import an external Hi-C matrix (text / matrix / cooler format) and
    convert it into a TADbit-style BAM file.

    Parses the optional target coordinate, reads the input with the
    format-appropriate reader, optionally converts cooler weights into a
    TADbit biases pickle, builds a ``HiC_data`` object and writes it out as
    a BAM via ``create_BAMhic``, recording the job in the database.

    NOTE(review): original line breaks inside the triple-quoted error
    messages could not be recovered from this view; the message text is
    preserved as displayed.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])
    coord1 = opts.coord1
    # parse "chrom:beg-end"; a bare chromosome name is also accepted
    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            # header must carry chromosome sizes ("# CRM <name> <len>")
            masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception('''ERROR: Chromosome size not included in import file. Please include the chromosome sizes of the data that you want to import in the header of the file. Example: # CRM chr1 249250621''')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, opts.reso if opts.reso > 1 else None):
            chroms_gen = parse_header(opts.input,
                                      opts.reso if opts.reso > 1 else None)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception('''ERROR: Chromosome size not included in import file. ''')
        else:
            raise Exception('''ERROR: The input file is not a cooler''')
    # number of bins per chromosome at the requested resolution
    chroms = OrderedDict((crm, int(chroms_gen[crm] // opts.reso) + 1)
                         for crm in chroms_gen)
    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        #size = (end1 - start1)//opts.reso
        size = chroms[region1]
        sections.extend([(region1, i)
                         for i in range(start1 // opts.reso,
                                        (end1 // opts.reso))])
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('''ERROR: The size of the specified region is different from the data in the matrix''')
    elif opts.format == 'cooler':
        matrix, weights, size, header = parse_cooler(
            opts.input, opts.reso if opts.reso > 1 else None,
            normalized=True, raw_values=True)
        masked = {}
        size_mat = size
        # a cooler with non-trivial weights carries its own normalization:
        # convert the weights into a TADbit biases pickle
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)
            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            out = open(bias_file, 'wb')
            # weight == 0 marks a filtered-out (bad) column
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            dump(
                {
                    'biases': dict((k, b if b > 0 else float('nan'))
                                   for k, b in enumerate(weights)),
                    'decay': {},
                    'badcol': badcol,
                    'resolution': opts.reso
                }, out, HIGHEST_PROTOCOL)
            out.close()
    hic = HiC_data(matrix, size_mat, dict_sec=dict_sec,
                   chromosomes=chroms, masked=masked, resolution=opts.reso)
    #from pytadbit.mapping.analyze import hic_map
    #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis')
    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)
    total_counts = create_BAMhic(hic, opts.cpus, outbam, chroms_gen,
                                 opts.reso, samtools=opts.samtools)
    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
def run(opts):
    """
    Extract Hi-C sub-matrices from a TADbit-filtered BAM file.

    Depending on ``opts``, writes text matrices and/or plots heatmaps for each
    requested normalization, then records produced files in the jobs database.
    Regions come from ``opts.coord1``/``opts.coord2``, either as a bare
    chromosome name or as ``chrN:start-end``.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])
    # color-scale boundaries for plotting, given as a "vmin,vmax" string
    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None
    clean = True  # change for debug
    if opts.bam:
        # external BAM: biases must be provided unless only raw matrices asked
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        # fetch BAM and biases paths from the working-directory database
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        # an explicitly passed biases file always wins over the DB entry
        biases = opts.biases
    coord1 = opts.coord1
    coord2 = opts.coord2
    # a single region passed as coord2 behaves as coord1
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1
    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        # parse "chr:beg-end"; a bare chromosome name raises ValueError
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None
    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)
    # user feedback: what is about to be extracted
    if region1:
        if region1:  # NOTE(review): redundant double check, kept as-is
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')
    out_files = {}
    out_plots = {}
    if opts.matrix or opts.plot:
        # chromosome lengths (in nucleotides) and cumulative genomic offsets
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm, region1=region1, start1=start1,
                    end1=end1, region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates to matrix-local (0-based) indices
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                # generator of (chromosome, start-bp, end-bp) row labels
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0,
                                 ends[r] if r < len(ends) and ends[r]
                                 else sections[reg],
                                 opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso).replace(' ', ''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                # header: chromosome sizes and masked columns
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join(
                        '%s\t%d\t%d\t' % (row_names.next()) +
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join(
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name,
                                          nicer(opts.reso).replace(' ', ''),
                                          ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                # densify the sparse dict into a numpy array (rows = j axis)
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                # replace zeroes by half the smallest non-zero value so that
                # log2 below stays finite
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # mask filtered rows/columns
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix, interpolation='None', origin='lower',
                           cmap=cmap, vmin=vmin, vmax=vmax)
                if len(regions) <= 2:
                    # one or two regions: genomic-coordinate axis labels
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = (pltbeg1 if len(regions) == 1
                               else 0 if start2 is None else start2)
                    pltend2 = (pltend1 if len(regions) == 1
                               else sections[regions[-1]]
                               if end2 is None else end2)
                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        # bin index -> genomic coordinate on the x axis
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        # bin index -> genomic coordinate on the y axis
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))
                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    # whole genome: chromosome names centered between borders
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                # side panel: histogram of (finite) log2 values + color bar
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data, color='darkgrey', linewidth=2,
                             orientation='horizontal', bins=50,
                             histtype='step', normed=True)
                _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        # no in-memory matrix requested: stream matrices straight to disk
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))
    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)
    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
def read_bam(inbam, filter_exclude, resolution, min_count=2500, sigma=2, ncpus=8, factor=1, outdir='.', check_sum=False): bamfile = AlignmentFile(inbam, 'rb') sections = OrderedDict( zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths])) total = 0 section_pos = dict() for crm in sections: section_pos[crm] = (total, total + sections[crm]) total += sections[crm] + 1 bins = [] for crm in sections: len_crm = sections[crm] bins.extend([(crm, i) for i in xrange(len_crm + 1)]) start_bin = 0 end_bin = len(bins) + 1 total = len(bins) total = end_bin - start_bin + 1 regs = [] begs = [] ends = [] njobs = min(total, 100) + 1 nbins = total / njobs + 1 for i in range(start_bin, end_bin, nbins): if i + nbins > end_bin: # make sure that we stop at the right place nbins = end_bin - i try: (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1] except IndexError: (crm1, beg1), (crm2, end2) = bins[i], bins[-1] if crm1 != crm2: end1 = sections[crm1] beg2 = 0 regs.append(crm1) regs.append(crm2) begs.append(beg1 * resolution) begs.append(beg2 * resolution) ends.append(end1 * resolution + resolution) # last nt included ends.append(end2 * resolution + resolution - 1) # last nt not included (overlap with next window) else: regs.append(crm1) begs.append(beg1 * resolution) ends.append(end2 * resolution + resolution - 1) ends[-1] += 1 # last nucleotide included # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)]) printime('\n - Parsing BAM (%d chunks)' % (len(regs))) bins_dict = dict([(j, i) for i, j in enumerate(bins)]) pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): procs.append( pool.apply_async(read_bam_frag, args=( inbam, filter_exclude, bins, bins_dict, resolution, outdir, region, start, end, ))) pool.close() print_progress(procs) pool.join() ## COLLECT RESULTS verbose = True cisprc = {} for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)): if verbose: if not countbin % 10 
and countbin: sys.stdout.write(' ') if not countbin % 50 and countbin: sys.stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs)))) sys.stdout.write('.') sys.stdout.flush() fname = os.path.join(outdir, 'tmp_bins_%s:%d-%d.pickle' % (region, start, end)) tmp_cisprc = load(open(fname)) cisprc.update(tmp_cisprc) if verbose: print '%s %9s\n' % (' ' * (54 - (countbin % 50) - (countbin % 50) / 10), '%s/%s' % (len(regs), len(regs))) # out = open(os.path.join(outdir, 'dicos_%s.pickle' % ( # nicer(resolution).replace(' ', ''))), 'w') # dump(cisprc, out) # out.close() # bad columns def func_gen(x, *args): cmd = "zzz = " + func_restring % (args) exec(cmd) in globals(), locals() #print cmd try: return np.lib.asarray_chkfinite(zzz) except: # avoid the creation of NaNs when invalid values for power or log return x print ' - Removing columns with too few or too much interactions' if not min_count: badcol = filter_by_cis_percentage( cisprc, sigma=sigma, verbose=True, savefig=os.path.join(outdir + 'filtered_bins_%s.png' % (nicer(resolution).replace(' ', '')))) else: print ' -> too few interactions defined as less than %9d interactions' % ( min_count) for k in cisprc: cisprc[k] = cisprc[k][1] badcol = {} countL = 0 countZ = 0 for c in xrange(total): if cisprc.get(c, 0) < min_count: badcol[c] = cisprc.get(c, 0) countL += 1 if not c in cisprc: countZ += 1 print ' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % ( len(badcol), countZ, countL, total, float(len(badcol)) / total * 100) printime(' - Rescaling biases') size = len(bins) biases = [cisprc.get(k, 1.) 
for k in range(size)] mean_col = float(sum(biases)) / len(biases) biases = dict([(k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases)]) # collect subset-matrices and write genomic one # out = open(os.path.join(outdir, # 'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w') pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end)) procs.append(pool.apply_async(sum_nrm_matrix, args=( fname, biases, ))) pool.close() print_progress(procs) pool.join() # to correct biases sumnrm = sum(p.get() for p in procs) target = (sumnrm / float(size * size * factor))**0.5 biases = dict([(b, biases[b] * target) for b in biases]) # check the sum if check_sum: pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end)) procs.append( pool.apply_async(sum_nrm_matrix, args=( fname, biases, ))) pool.close() print_progress(procs) pool.join() # to correct biases sumnrm = sum(p.get() for p in procs) print 'SUM:', sumnrm printime(' - Rescaling decay') # normalize decay by size of the diagonal, and by Vanilla correction # (all cells must still be equals to 1 in average) pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end)) procs.append( pool.apply_async(sum_dec_matrix, args=(fname, biases, badcol, bins))) pool.close() print_progress(procs) pool.join() # collect results sumdec = {} for proc in procs: for k, v in proc.get().iteritems(): try: sumdec[k] += v except KeyError: sumdec[k] = v # count the number of cells per diagonal # TODO: parallelize # find larget chromsome len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos) # initialize dictionary ndiags = dict((k, 0) for k in xrange(len_big)) for crm in section_pos: 
beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1] chr_size = end_chr - beg_chr thesebads = [b for b in badcol if beg_chr <= b <= end_chr] for dist in xrange(1, chr_size): ndiags[dist] += chr_size - dist # from this we remove bad columns # bad columns will only affect if they are at least as distant from # a border as the distance between the longest diagonal and the # current diagonal. bad_diag = set( ) # 2 bad rows can point to the same bad cell in diagonal maxp = end_chr - dist minp = beg_chr + dist for b in thesebads: if b <= maxp: bad_diag.add(b) if b >= minp: bad_diag.add(b - dist) ndiags[dist] -= len(bad_diag) # chr_sizeerent behavior for longest diagonal: ndiags[0] += chr_size - len(thesebads) # normalize sum per diagonal by total number of cells in diagonal for k in sumdec: try: sumdec[k] /= ndiags[k] except ZeroDivisionError: # all columns at this distance are "bad" pass return biases, sumdec, badcol
def run(opts):
    """
    Normalize a Hi-C experiment stored as a TADbit BAM file.

    Computes biases, decay and bad columns via ``read_bam``; for the 'oneD'
    normalization it first builds per-bin mappability, GC content and
    restriction-site counts from FASTA/bedGraph inputs. Results are pickled
    and registered in the jobs database.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))
    filter_exclude = opts.filter
    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)
    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        # oneD needs genome sequence, enzyme and mappability track
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()
        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)
        # reconcile FASTA and BAM chromosome sets: extra FASTA entries are
        # dropped, extra BAM entries are fatal
        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([(' - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")
        # get mappability ~2 min
        # stream the bedGraph once, averaging overlapping intervals into
        # resolution-sized bins per chromosome
        printime(' - Parsing mappability')
        fh = open(opts.mappability)
        mappability = dict((c, []) for c in refs)
        line = fh.next()
        crmM, begM, endM, val = line.split()
        crm = crmM
        if crmM not in mappability:
            # leading chromosomes absent from the BAM are skipped
            print(' skipping %s' % crmM)
            while crmM not in mappability:
                line = fh.next()
                crmM, begM, endM, val = line.split()
            crm = crmM
        while any(not mappability[c] for c in mappability):
            for begB in xrange(0, len(genome[crmM]), opts.reso):
                endB = begB + opts.reso
                tmp = 0  # weighted sum of mappability over the bin
                try:
                    while True:
                        crmM, begM, endM, val = line.split()
                        if crm != crmM:
                            # reached a new chromosome: skip any not in refs
                            try:
                                while crmM not in refs:
                                    line = fh.next()
                                    crmM, _ = line.split('\t', 1)
                            except StopIteration:
                                pass
                            break
                        begM = int(begM)
                        endM = int(endM)
                        if endM > endB:
                            # interval overruns the bin: count the overlap
                            # and keep the line for the next bin
                            weight = endB - begM
                            if weight >= 0:
                                tmp += weight * float(val)
                            break
                        weight = endM - (begM if begM > begB else begB)
                        if weight < 0:
                            break
                        tmp += weight * float(val)
                        line = fh.next()
                except StopIteration:
                    pass
                mappability[crm].append(tmp / opts.reso)
            crm = crmM
        # concatenate per-chromosome tracks in BAM order
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))
        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))
        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso,
        #                                         pos * opts.reso + opts.reso,
        #                                         mappability[i]))
        #         i += 1
        # out.close()
    # compute GC content ~30 sec
    # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc,
        max_perc=opts.max_perc, normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs, extra_bads=opts.badcols)
    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' % (nicer(opts.reso).replace(' ', ''),
                                             param_hash))
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (opts.reso,
                                                          param_hash))
    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.
    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' % (nicer(opts.reso).replace(' ', ''),
                                         param_hash))
    out = open(bias_file, 'w')
    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out)
    out.close()
    finish_time = time.localtime()
    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def read_bam(inbam, filter_exclude, resolution, min_count=2500, biases_path='',
             normalization='Vanilla', mappability=None, n_rsites=None,
             cg_content=None, sigma=2, ncpus=8, factor=1, outdir='.', seed=1,
             extra_out='', only_valid=False, normalize_only=False, p_fit=None,
             max_njobs=100, min_perc=None, max_perc=None, extra_bads=None):
    """
    Parse a TADbit-filtered BAM file in parallel chunks and normalize it.

    Supports the 'Vanilla', 'SQRT', 'ICE', 'oneD' and 'custom' normalization
    strategies, filters out low-coverage / low-mappability / user-specified
    columns, and rescales the decay of interactions with genomic distance
    per chromosome.

    :returns: (biases dict, normalized decay per chromosome per distance,
       bad columns dict, raw cis-percentage, normalized cis-percentage)
    """
    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins per chromosome (Python 2 integer division)
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    # genomic bins as (chromosome, bin-index) tuples
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])
    start_bin = 0
    end_bin = len(bins)
    total = len(bins)
    # split the genome into at most max_njobs chunks of contiguous bins,
    # breaking chunks that would span a chromosome border into two
    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime(' - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    # choose the fragment parser: only valid pairs, or filter-mask aware
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, extra_out,
                                 region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    cisprc = {}
    printime(' - Collecting cis and total interactions per bin (%d chunks)' %
             (len(regs)))
    stdout.write(' ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        # dotted progress line, one dot per collected chunk
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()
        fname = path.join(outdir, 'tmp_bins_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')
    printime(' - Removing columns with too few or too much interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc,
            max_perc=max_perc, size=total, savefig=None)
    else:
        print (' -> too few interactions defined as less than %9d '
               'interactions') % (min_count)
        badcol = {}
        countL = 0
        countZ = 0
        # cisprc values are [cis, total] pairs; filter on totals
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print ' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100)
    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)
    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            # each entry is "chrom:beg-end" in nucleotides
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime(' - Removed %d columns manually.' % removed_manually)
    # average cis fraction over the kept columns
    raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                     for k in cisprc
                     if not k in badcol) / (len(cisprc) - len(badcol))
    printime(' - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
              for k in xrange(size)]
    if normalization == 'ICE':
        printime(' - ICE normalization')
        hic_data = load_hic_data_from_bam(
            inbam, resolution, filter_exclude=filter_exclude,
            tmpdir=outdir, ncpus=ncpus)
        hic_data.bads = badcol
        hic_data.normalize_hic(iterations=100, max_dev=0.000001)
        biases = hic_data.bias.copy()
        del(hic_data)
    elif normalization == 'Vanilla':
        printime(' - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict((k, b / mean_col * mean_col**0.5)
                      for k, b in enumerate(biases))
    elif normalization == 'SQRT':
        printime(' - Vanilla-SQRT normalization')
        biases = [b**0.5 for b in biases]
        mean_col = nanmean(biases)
        biases = dict((k, b / mean_col * mean_col**0.5)
                      for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime(' - oneD normalization')
        # all per-bin covariates must align with the bias vector
        if len(set([len(biases), len(mappability), len(n_rsites),
                    len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, p_fit=p_fit, tot=biases,
                      map=mappability, res=n_rsites, cg=cg_content, seed=seed)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    elif normalization == 'custom':
        # read one bias per line from biases_path (header skipped); 'N' or 0
        # marks a bad column
        n_pos = 0
        biases = {}
        print 'Using provided biases...'
        with open(biases_path, 'r') as r:
            r.next()
            for line in r:
                if line[0] == 'N':
                    # b = float('nan')
                    badcol[n_pos] = 0
                    biases[n_pos] = float('nan')
                else:
                    b = float(line)
                    if b == 0:
                        badcol[n_pos] = 0
                        biases[n_pos] = float('nan')
                    else:
                        biases[n_pos] = b
                n_pos += 1
        # pad the tail with NaNs up to the genome size
        # NOTE(review): range starts at max key (overwriting the last read
        # bias) and goes to total + 1 — possible off-by-one, kept as-is
        for add in range(max(biases.keys()), total + 1):
            biases[add] = float('nan')
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)
    # collect subset-matrices and write genomic one
    printime(' - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases,)))
    pool.close()
    print_progress(procs)
    pool.join()
    # to correct biases: scale so the normalized matrix averages `factor`
    sumnrm = sum(p.get() for p in procs)
    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])
    if not normalize_only:
        printime(' - Computing Cis percentage')
        # Calculate Cis percentage
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
                region, start, end, extra_out))
            procs.append(pool.apply_async(get_cis_perc,
                                          args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()
        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print ' * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.
    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()
    # collect results: per-chromosome, per-distance sums (normalized and raw)
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:  # first time this distance is seen
                    try:
                        nrmdec[c][k] = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:  # first time this chromosome is seen
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}
    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict((c, section_pos[c][1] - section_pos[c][0])
                    for c in section_pos)
    # initialize dictionary
    ndiags = dict((c, dict((k, 0) for k in xrange(len_crms[c])))
                  for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr
                                         for b in thesebads)
    # normalize sum per diagonal by total number of cells in diagonal
    # distances are pooled until the raw count passes min_n so that each
    # reported decay value is backed by enough signal
    signal_to_noise = 0.05
    min_n = signal_to_noise ** -2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store count by diagonal
        tmpsum = 0  # store count by diagonal
        ndiag = 0
        val = 0
        previous = []  # store diagonals to be summed in case not reaching the minimum
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for l in previous:
                        nrmdec[crm][l] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
def run(opts):
    """
    Compute and store normalization biases for a Hi-C BAM file.

    Parses the filtered BAM at the requested resolution, filters bad
    columns, computes biases (Vanilla or oneD), plots the decay of
    interactions with genomic distance, pickles the results into
    ``<workdir>/04_normalization`` and registers the job in the DB.

    :param opts: parsed command-line options (argparse namespace)
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        # chromosomes present in FASTA but not in BAM are dropped;
        # the reverse situation is fatal
        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize each chromosome to its number of bins, padding with NaN
        # (fixed: previously used len(refs) -- the number of chromosomes --
        # instead of the chromosome length len(genome[c]))
        for c in refs:
            if not c in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates all chromosomes into a single per-bin list
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        # compute GC content ~30 sec
        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            # each bin extended by 200 bp on both sides before counting
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # binary pickle (HIGHEST_PROTOCOL) requires a binary-mode file handle
    # (fixed: was opened in text mode 'w')
    out = open(bias_file, 'wb')
    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:  # release DB lock whatever happened, then abort
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def run(opts):
    """
    Merge two Hi-C experiments (BAM files) into a single BAM.

    Unless ``opts.skip_comparison`` is set, first compares the two
    experiments (correlation between equidistant loci, correlation
    between eigenvectors, reproducibility score), then merges and
    indexes the BAMs with samtools and registers the result in the DB.

    :param opts: parsed command-line options (argparse namespace)
    """
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:  # no biases stored for this job
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:  # no biases stored for this job
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime(' - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime(' - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime(' - comparing experiments')
        printime(' => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print(' - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime(' => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime(' => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print(' - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        # fixed: scc, std and reprod were left undefined when the comparison
        # was skipped, making save_to_db below crash with a NameError
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime(' - Mergeing experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
    printime(' - Indexing new BAM file')
    # check samtools version number and modify command line:
    # multi-threaded indexing (-@) only exists since samtools 1.3.1
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools + ' index %s' % (outbam))

    finish_time = time.localtime()

    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
def run(opts):
    """
    Compute and store normalization biases for a Hi-C BAM file.

    Parses the filtered BAM at the requested resolution, filters bad
    columns, computes biases (Vanilla or oneD), plots the decay of
    interactions with genomic distance, pickles the results into
    ``<workdir>/04_normalization`` and registers the job in the DB.

    :param opts: parsed command-line options (argparse namespace)
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        # chromosomes present in FASTA but not in BAM are dropped;
        # the reverse situation is fatal
        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([(' - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([(' - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize each chromosome to its number of bins, padding with NaN
        # (fixed: previously used len(refs) -- the number of chromosomes --
        # instead of the chromosome length len(genome[c]))
        for c in refs:
            if not c in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates all chromosomes into a single per-bin list
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        # compute GC content ~30 sec
        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            # each bin extended by 200 bp on both sides before counting
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' % (
            nicer(opts.reso).replace(' ', ''), param_hash))
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' % (
            nicer(opts.reso).replace(' ', ''), param_hash))
    # binary pickle (HIGHEST_PROTOCOL) requires a binary-mode file handle
    # (fixed: was opened in text mode 'w')
    out = open(bias_file, 'wb')
    dump({'biases': biases,
          'decay': decay,
          'badcol': badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:  # release DB lock whatever happened, then abort
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def read_bam(inbam, filter_exclude, resolution, min_count=2500, normalization='Vanilla',
             mappability=None, n_rsites=None, cg_content=None, sigma=2,
             ncpus=8, factor=1, outdir='.', extra_out='', only_valid=False,
             normalize_only=False, max_njobs=100, min_perc=None, max_perc=None,
             extra_bads=None):
    """
    Parse a TADbit-filtered BAM file at a given resolution, remove bad
    columns, compute per-bin normalization biases (Vanilla or oneD) and
    the decay of interactions with genomic distance.

    :param inbam: path to the (indexed) BAM file
    :param filter_exclude: binary flag of read filters to exclude
    :param resolution: bin size in nucleotides
    :param 2500 min_count: minimum number of interactions per bin; when
       None, filtering is done on cis-percentage instead
    :param 'Vanilla' normalization: either 'Vanilla' or 'oneD'
    :param None mappability: per-bin mappability values (oneD only)
    :param None n_rsites: per-bin restriction-site counts (oneD only)
    :param None cg_content: per-bin GC content (oneD only)
    :param 2 sigma: spread used by the cis-percentage filter
    :param 8 ncpus: number of worker processes
    :param 1 factor: target average of the normalized matrix
    :param '.' outdir: directory for temporary pickles and figures
    :param '' extra_out: suffix appended to temporary/output file names
    :param False only_valid: count only valid pairs
    :param False normalize_only: skip cis-percentage computation
    :param 100 max_njobs: maximum number of parallel chunks
    :param None min_perc: minimum cis-percentage to keep a column
    :param None max_perc: maximum cis-percentage to keep a column
    :param None extra_bads: extra regions ('chrom:beg-end') to mask manually

    :returns: biases (dict bin -> bias), normalized decay per chromosome
       (dict of dicts), bad columns (dict), raw cis-percentage and
       normalized cis-percentage
    """
    bamfile = AlignmentFile(inbam, 'rb')
    # number of bins per chromosome (+1 accounts for the partial last bin)
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()  # chromosome -> (first genomic bin, last genomic bin)
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    bins = []  # genomic bin index -> (chromosome, bin within chromosome)
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    # split the genome into at most max_njobs chunks to parse in parallel
    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            # chunk spans two chromosomes: split it at the boundary
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    printime(' - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    # each worker writes its partial counts to a pickle in outdir
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, extra_out,
                                 region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    cisprc = {}  # bin -> [cis interactions, total interactions]
    printime(' - Collecting cis and total interactions per bin (%d chunks)' % (len(regs)))
    stdout.write(' ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        # progress dots, grouped by 10 and wrapped every 50 chunks
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(outdir,
                          'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime(' - Removing columns with too few or too much interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc,
            max_perc=max_perc, size=total,
            savefig=path.join(outdir, 'filtered_bins_%s_%s.png' % (
                nicer(resolution).replace(' ', ''), extra_out)))
    else:
        print (' -> too few interactions defined as less than %9d '
               'interactions') % (min_count)
        badcol = {}
        countL = 0  # columns with too few interactions
        countZ = 0  # columns with no interaction at all
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print ' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100)

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            # convert genomic coordinates to genome-wide bin indices
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime(' - Removed %d columns manually.' % removed_manually)

    # average cis-percentage over good columns, from raw counts
    raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                     for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol))

    printime(' - Rescaling sum of interactions per bins')
    size = len(bins)
    # column sums; bad columns are masked with NaN
    biases = [float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
              for k in xrange(size)]

    if normalization == 'Vanilla':
        printime(' - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict((k, b / mean_col * mean_col**0.5)
                      for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime(' - oneD normalization')
        # all per-bin arrays must be aligned for the oneD regression
        if len(set([len(biases), len(mappability),
                    len(n_rsites), len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, tot=biases, map=mappability,
                      res=n_rsites, cg=cg_content)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime(' - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases,)))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases: rescale so the normalized matrix averages `factor`
    sumnrm = sum(p.get() for p in procs)
    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime(' - Computing Cis percentage')
        # Calculate Cis percentage
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(outdir,
                              'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
            procs.append(pool.apply_async(get_cis_perc,
                                          args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print ' * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime(' - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}  # chromosome -> {distance: summed normalized counts}
    rawdec = {}  # chromosome -> {distance: summed raw counts}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:  # first time this distance is seen
                    try:
                        nrmdec[c][k] = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:  # first time this chromosome is seen
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict((c, section_pos[c][1] - section_pos[c][0])
                    for c in section_pos)
    # initialize dictionary
    ndiags = dict((c, dict((k, 0) for k in xrange(len_crms[c])))
                  for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr
                                         for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise**-2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store count by diagonal
        tmpsum = 0  # store count by diagonal
        ndiag = 0
        val = 0
        previous = []  # store diagonals to be summed in case not reaching the minimum
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            # merge consecutive diagonals until enough raw counts accumulate
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for k in previous:
                        nrmdec[crm][k] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
def run(opts):
    """
    Extract a (sub-)matrix from a TADbit BAM file and write it in one of
    several formats: 'matrix' (dense text), 'hic' (juicer), 'text' or
    'cooler'.

    :param opts: parsed command-line options (argparse namespace)
    """
    check_options(opts)
    param_hash = digest_parameters(opts, extra=['quiet'])

    opts.normalizations = ['norm' if opts.norm else 'raw']
    biases = None
    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and opts.norm:
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always treated as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        # coordinates are 'chrom:beg-end'; a bare chromosome name has no ':'
        # and falls into the ValueError branch
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    # report what will be extracted (unless --quiet)
    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of %s genome\n' % (
                'partial' if opts.chr_name else 'full'))

    norm = 'norm' if opts.norm else 'raw'

    if opts.format == 'matrix' or opts.format == 'hic':
        bamfile = AlignmentFile(mreads, 'rb')
        bam_refs = bamfile.references
        bam_lengths = bamfile.lengths
        if opts.chr_name:
            # restrict/reorder chromosomes following opts.chr_name
            bam_refs_idx = [bam_refs.index(chr_ord)
                            for chr_ord in opts.chr_name if chr_ord in bam_refs]
            if not bam_refs_idx:
                raise Exception('''ERROR: Wrong number of chromosomes in chr_order.
                    Found %s in bam file \n''' % (' '.join(bam_refs)))
            bam_refs = [bam_ref for bam_ref in [
                bam_refs[bam_ref_idx] for bam_ref_idx in bam_refs_idx]]
            bam_lengths = [bam_len for bam_len in [
                bam_lengths[bam_ref_idx] for bam_ref_idx in bam_refs_idx]]
        sections = OrderedDict(list(zip(bam_refs, [x for x in bam_lengths])))

        printime('Getting %s matrices' % norm)
        matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
            mreads, opts.reso,
            load(open(biases, 'rb')) if biases and norm != 'raw' else None,
            normalization=norm, filter_exclude=opts.filter,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            clean=clean, chr_order=opts.chr_name)

        # rebase bin coordinates so the sub-matrix starts at 0
        b1, e1, b2, e2 = bin_coords
        b1, e1 = 0, e1 - b1
        b2, e2 = 0, e2 - b2

        if opts.format == 'matrix':
            if opts.row_names:
                # generator of (chromosome, 1-based start, end) per row
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts)
                                            and starts[r] else 0,
                                            ends[r] if r < len(ends)
                                            and ends[r] else sections[reg],
                                            opts.reso))
            printime(' - Writing: %s' % norm)
            out = open(opts.out, 'w')
            for reg in regions:
                out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
            if region2:
                out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
            else:
                out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
            if opts.row_names:
                out.write('\n'.join(
                    '%s\t%d\t%d\t' % (next(row_names)) +
                    '\t'.join(str(matrix.get((i, j), 0))
                              for i in range(b1, e1))
                    for j in range(b2, e2)) + '\n')
            else:
                out.write('\n'.join(
                    '\t'.join(str(matrix.get((i, j), 0))
                              for i in range(b1, e1))
                    for j in range(b2, e2)) + '\n')
            out.close()
        else:
            # 'hic' format: write a chrom.sizes file and a TSV in juicer
            # 'pre' input layout, then call juicer_tools
            printime(' - Writing: %s' % norm)
            tmp_chromsize = path.join(tmpdir, 'hic_%s.chrom.sizes' % param_hash)
            out = open(tmp_chromsize, 'w')
            for reg in regions:
                out.write('%s\t%d\n' % (reg, sections[reg]))
            out.close()
            tmpfl = path.join(tmpdir, 'hic_export_%s.tsv' % param_hash)
            out = open(tmpfl, 'w')
            # NOTE(review): the trailing '1%f'/'1%d' (no separator between
            # the last '1' and the score) looks suspicious -- confirm
            # against the juicer_tools 'pre' short-with-score format
            out_ln = '0\t%s\t%d\t0\t1\t%s\t%d\t1\t1%f' if opts.norm \
                else '0\t%s\t%d\t0\t1\t%s\t%d\t1\t1%d'
            if region1:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = [(reg, pos + 1)
                             for r, reg in enumerate(regions)
                             for pos in range(starts[r] if r < len(starts)
                                              and starts[r] else 0,
                                              ends[r] if r < len(ends)
                                              and ends[r] else sections[reg],
                                              opts.reso)]
                # upper triangle only (j starts at i)
                out.write('\n'.join(
                    out_ln % (row_names[i][0], row_names[i][1],
                              row_names[j][0], row_names[j][1],
                              matrix.get((i, j), 0))
                    for i in range(b1, e1) for j in range(i, e2)))
            else:
                # whole genome: iterate chromosome pairs (upper triangle)
                totals = OrderedDict()
                total_num = 0
                for c in sections:
                    totals[c] = (total_num,
                                 total_num + sections[c] // opts.reso + 1)
                    total_num += sections[c] // opts.reso + 1
                for crm1_id, crm1 in enumerate(sections):
                    b1, e1 = totals[crm1]
                    row_names1 = dict((b1 + ipos, pos + 1)
                                      for ipos, pos in enumerate(
                                          range(0, sections[crm1], opts.reso)))
                    for crm2 in list(sections.keys())[crm1_id:]:
                        b2, e2 = totals[crm2]
                        row_names2 = dict((b2 + ipos, pos + 1)
                                          for ipos, pos in enumerate(
                                              range(0, sections[crm2], opts.reso)))
                        out.write('\n'.join(
                            out_ln % (crm1, row_names1[i],
                                      crm2, row_names2[j],
                                      matrix.get((i, j), 0))
                            for i in range(b1, e1)
                            for j in range(max(b2, i), e2)))
            out.close()
            do_norm = '-n' if opts.norm else ''
            _ = Popen('java -Xmx32g -jar %s pre -j %d %s %s %s %s' % (
                opts.juicerjar, opts.cpus, do_norm, tmpfl, opts.out,
                tmp_chromsize), shell=True,
                universal_newlines=True).communicate()
    elif opts.format == 'text':
        printime('Getting and writing matrix to text format')
        fnames = write_matrix(mreads, opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir, filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1, start1=start1, end1=end1,
                              region2=region2, start2=start2, end2=end2,
                              tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
                              nchunks=opts.nchunks, verbose=not opts.quiet,
                              extra=param_hash, cooler=False, clean=clean,
                              chr_order=opts.chr_name)
        rename(list(fnames.values())[0], opts.out)
    elif opts.format == 'cooler':
        printime('Getting and writing matrix to cooler format')
        fnames = write_matrix(mreads, opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir, filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1, start1=start1, end1=end1,
                              region2=region2, start2=start2, end2=end2,
                              tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
                              nchunks=opts.nchunks, verbose=not opts.quiet,
                              extra=param_hash, cooler=True, clean=clean,
                              chr_order=opts.chr_name)
        # append coarser zoom levels to the same cooler (raw counts only),
        # skipping zooms coarser than the requested region allows
        for zoom_c in ZOOMS_COOLER:
            if opts.reso >= zoom_c:
                continue
            if start1 is not None and end1:
                if end1 - start1 < zoom_c:
                    continue
            if start2 is not None and end2:
                if end2 - start2 < zoom_c:
                    continue
            printime('Building cooler zoom %d' % zoom_c)
            _ = write_matrix(mreads, zoom_c, None,
                             outdir, filter_exclude=opts.filter,
                             normalizations=['raw'],
                             region1=region1, start1=start1, end1=end1,
                             region2=region2, start2=start2, end2=end2,
                             tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
                             nchunks=opts.nchunks, verbose=not opts.quiet,
                             extra=param_hash, cooler=True,
                             cooler_name=fnames['NRM' if opts.norm else 'RAW'],
                             clean=clean, chr_order=opts.chr_name)
        rename(fnames['NRM' if opts.norm else 'RAW'], opts.out)
        # keep only the requested normalization
        if 'NRM' in fnames and not opts.norm:
            remove(fnames['NRM'])
        if 'RAW' in fnames and opts.norm:
            remove(fnames['RAW'])
    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)