def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    crm = load_hic_data(opts)
    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get best set of parameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good models
    big_run(exp, opts, job_file_handler, outdir, optpar)
    finish_time = time.localtime()

def check_options(opts):
    # convert genomic coordinates to bins
    try:
        opts.beg = int(float(opts.beg) / opts.reso)
        opts.end = int(float(opts.end) / opts.reso)
        if opts.end - opts.beg <= 2:
            raise Exception('"beg" and "end" parameter should be given in '
                            'genomic coordinates, not bin')
    except TypeError:
        pass

    # turn options into lists ("beg:end:step" strings expand to ranges)
    opts.scale   = (tuple(arange(*[float(s) for s in opts.scale.split(':')]))
                    if ':' in opts.scale else [float(opts.scale)])
    opts.maxdist = (tuple(range(*[int(i) for i in opts.maxdist.split(':')]))
                    if ':' in opts.maxdist else [int(opts.maxdist)])
    opts.upfreq  = (tuple(arange(*[float(i) for i in opts.upfreq.split(':')]))
                    if ':' in opts.upfreq else [float(opts.upfreq)])
    opts.lowfreq = (tuple(arange(*[float(i) for i in opts.lowfreq.split(':')]))
                    if ':' in opts.lowfreq else [float(opts.lowfreq)])
    opts.dcutoff = (tuple(arange(*[float(i) for i in opts.dcutoff.split(':')]))
                    if ':' in opts.dcutoff else [float(opts.dcutoff)])
    opts.nmodels_run = opts.nmodels_run or opts.nmodels
    opts.matrix  = path.abspath(opts.matrix)
    opts.workdir = path.abspath(opts.workdir)
    mkdir(opts.workdir)

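# Hedged sketch (not part of the original module): a standalone illustration
# of the "beg:end:step" option expansion performed in check_options() above.
# It assumes `arange` comes from numpy, as the module's own calls suggest.
from numpy import arange

def _expand(value, cast=float):
    """Mimic the option expansion of check_options()."""
    if ':' in value:
        return tuple(arange(*[cast(v) for v in value.split(':')]))
    return [cast(value)]

# _expand('0.005:0.01:0.001') -> roughly (0.005, 0.006, 0.007, 0.008, 0.009)
# _expand('400')              -> [400.0]
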
def optimization(exp, opts, job_file_handler, outdir):
    models = compile_models(opts, outdir)
    print 'Optimizing parameters...'
    print ('# %3s %6s %7s %7s %6s\n' % (
        "num", "upfrq", "lowfrq", "maxdist", "scale"))
    for m, u, l, s in product(opts.maxdist, opts.upfreq,
                              opts.lowfreq, opts.scale):
        muls = tuple(map(my_round, (m, u, l, s)))
        if muls in models:
            print('%5s %6s %7s %7s %6s ' % ('x', u, l, m, s))
            continue
        elif opts.job_list:
            print('%5s %6s %7s %7s %6s ' % ('o', u, l, m, s))
        else:
            print('%5s %6s %7s %7s %6s ' % ('-', u, l, m, s))
        mkdir(path.join(outdir, 'cfg_%s_%s_%s_%s' % muls))

        # write list of jobs to be run separately
        if opts.job_list:
            for rand in xrange(1, opts.nmodels + 1, opts.nmodels_run):
                write_one_job(opts, rand, m, u, l, s, job_file_handler)
            continue

        # compute models
        try:
            run_batch_job(exp, opts, m, u, l, s, outdir)
        except TADbitModelingOutOfBound:
            warn('WARNING: scale (here %s) x resolution (here %d) should be '
                 'lower than maxdist (here %d instead of at least: %d)' % (
                     s, opts.reso, m, s * opts.reso))
            continue

    if opts.job_list:
        job_file_handler.close()

def main():
    opts           = get_options()
    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    min_count      = opts.min_count
    ncpus          = opts.cpus
    factor         = 1
    outdir         = opts.outdir
    sigma          = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus,
                                     sigma=sigma, factor=factor,
                                     outdir=outdir, check_sum=opts.check_sum)

    printime(' - Saving biases and badcol columns')
    # biases
    out = open(os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', ''))), 'w')
    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': resolution}, out)
    out.close()
    # hic_data.write_matrix('chr_names%s_%d-%d.mat' % (region, start, end), focus=())

    printime('\nDone.')

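# Hedged sketch (not part of the original module): reading back the pickle
# written by main() above. The dictionary keys match the dump() call; the
# cPickle import is an assumption consistent with this Python 2 code base.
from cPickle import load as _pickle_load

def load_biases(pickle_path):
    """Return the biases, decay, bad columns and resolution saved above."""
    with open(pickle_path) as handler:
        data = _pickle_load(handler)
    return data['biases'], data['decay'], data['badcol'], data['resolution']
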
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads  = path.join(opts.workdir, '03_filtered_reads',
                       'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print ' - median insert size =', median
    print ' - median absolute deviation of insert size =', mad
    print (' - max insert size (when a gap in continuity of > 10 bp is '
           'found in fragment lengths) ='), max_f

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print (' Using the maximum continuous fragment size '
           '(%d bp) to check '
           'for pseudo-dangling ends') % max_mole
    print (' Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print "identify pairs to filter..."
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)

def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # plotting options imply each other
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        if opts.nox:
            raise Exception('ERROR: no screen no fun.\n'
                            'Interactive plot incompatible with noX option.')
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # triangular plots need symmetric input
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if not opts.force:
                if 'tmpdb' in opts and opts.tmpdb:
                    remove(path.join(dbdir, dbfile))
                exit('WARNING: exact same job already computed, '
                     'see JOBs table above')
            else:
                warn('WARNING: exact same job already computed, '
                     'overwriting...')
    except IOError:
        warn(("\nWARNING:\n  new working directory created. It's ok... "
              "but next time use TADbit from the beginning!! :)"))

def check_options(opts):
    mkdir(opts.workdir)

    # create empty DB if it doesn't exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass
    else:
        if opts.workdir1:
            opts.tmpdb1 = path.join(opts.workdir1, 'trace.db')
        if opts.workdir2:
            opts.tmpdb2 = path.join(opts.workdir2, 'trace.db')

    # resolution needed to compare
    if not opts.skip_comparison and not opts.reso:
        raise Exception('ERROR: need to define resolution at which to compare')

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')

def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # plotting options imply each other
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if not opts.force:
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, '
                 'see JOBs table above')
        else:
            warn('WARNING: exact same job already computed, overwriting...')

def check_options(opts):
    mkdir(opts.workdir)

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass
    else:
        opts.tmpdb1 = path.join(opts.workdir1, 'trace.db')
        opts.tmpdb2 = path.join(opts.workdir2, 'trace.db')

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')

def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get best set of parameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good models
    big_run(exp, opts, job_file_handler, outdir, optpar)
    finish_time = time.localtime()

def check_options(opts):
    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # convert genomic coordinates to bins
    try:
        opts.beg = int(float(opts.beg) / opts.reso)
        opts.end = int(float(opts.end) / opts.reso)
        if opts.end - opts.beg <= 2:
            raise Exception('"beg" and "end" parameter should be given in '
                            'genomic coordinates, not bin')
    except TypeError:
        pass

    # turn options into lists
    opts.scale   = (tuple(arange(*[float(s) for s in opts.scale.split(':')]))
                    if ':' in opts.scale else [float(opts.scale)])
    opts.maxdist = (tuple(range(*[int(i) for i in opts.maxdist.split(':')]))
                    if ':' in opts.maxdist else [int(opts.maxdist)])
    opts.upfreq  = (tuple(arange(*[float(i) for i in opts.upfreq.split(':')]))
                    if ':' in opts.upfreq else [float(opts.upfreq)])
    opts.lowfreq = (tuple(arange(*[float(i) for i in opts.lowfreq.split(':')]))
                    if ':' in opts.lowfreq else [float(opts.lowfreq)])
    opts.dcutoff = (tuple(arange(*[float(i) for i in opts.dcutoff.split(':')]))
                    if ':' in opts.dcutoff else [float(opts.dcutoff)])
    opts.nmodels_run = opts.nmodels_run or opts.nmodels
    if opts.matrix:
        opts.matrix = path.abspath(opts.matrix)
    opts.workdir = path.abspath(opts.workdir)
    mkdir(opts.workdir)

    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)

def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # check custom normalization
    if opts.normalization == 'custom':
        if not opts.biases_path:
            raise IOError('ERROR: biases file required for "custom" '
                          'normalization.')
        elif not path.exists(opts.biases_path):
            raise IOError('ERROR: biases not found at path: %s' %
                          opts.biases_path)

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, '
                 'see JOBs table above')
    except IOError:  # new working directory
        pass

def check_options(opts):
    mkdir(opts.workdir)

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')

def check_options(opts):
    if not path.exists(opts.workdir):
        mkdir(opts.workdir)

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if (not path.exists(vlog_path)
            or open(vlog_path).readlines() != dependencies):
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # create empty DB if it doesn't exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

def write_matrix(inbam, resolution, biases, outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None,
                 nchunks=100, tmpdir='.', ncpus=8, verbose=True,
                 window=None):
    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)
    _, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
        transform  = lambda x, c, j, k: x / bias1[j] / bias2[k] / decay[c][abs(k - j)]
        transform2 = lambda x, j, k: x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}
        # no biases: raw values pass through unchanged
        transform  = lambda x, c, j, k: x
        transform2 = lambda x, j, k: x
    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)

    if verbose:
        printime('  - Writing matrices')
    mkdir(os.path.split(os.path.abspath(outfile))[0])
    # write the header of the output file
    out = open(outfile, 'w')
    nheader = 0
    for i, c in enumerate(bamfile.references):
        out.write('# CHROM\t{}\t{}\n'.format(c, bamfile.lengths[i]))
        nheader += 1
    out.write('# RESOLUTION\t{}\n'.format(resolution))
    nheader += 1
    out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
    nheader += 1

    if window == 'all':
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_

    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                         verbose=verbose, clean=clean):
        if k < j or j in badcols or k in badcols:  # we keep only half matrix
            continue
        if outside(c, j, k):
            continue
        try:
            n = transform(v, c, j, k)  # normalize
        except KeyError:
            n = transform2(v, j, k)    # normalize without decay
        out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))
    out.close()

    # this is the last thing we do in case something goes wrong
    if clean:
        os.system('rm -rf %s' % (os.path.join(tmpdir,
                                              '_tmp_%s' % (rand_hash))))

    return nheader

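# Hedged sketch (not part of the original module): a minimal reader for the
# text matrix written by write_matrix() above. Header lines start with '#'
# (CHROM / RESOLUTION / BADCOLS); each data row holds
# "bin1<TAB>bin2<TAB>raw<TAB>normalized".
def iter_matrix_file(matrix_path):
    with open(matrix_path) as handler:
        for line in handler:
            if line.startswith('#'):
                continue  # header: chromosome sizes, resolution, bad columns
            j, k, raw, nrm = line.split()
            yield int(j), int(k), int(raw), float(nrm)
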
def read_bam(inbam, filter_exclude, resolution, min_count=2500,
             biases_path='', normalization='Vanilla', mappability=None,
             n_rsites=None, cg_content=None, sigma=2, ncpus=8, factor=1,
             outdir='.', seed=1, extra_out='', only_valid=False,
             normalize_only=False, p_fit=None, max_njobs=100,
             min_perc=None, max_perc=None, extra_bads=None):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    # divide the genome into chunks (at most max_njobs of them)
    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])

    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, extra_out,
                                 region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)'
             % (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(outdir, 'tmp_bins_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too many interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc,
            max_perc=max_perc, size=total, savefig=None)
    else:
        print ('      -> too few interactions defined as less than %9d '
               'interactions') % (min_count)
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d/%d null/low counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100)

    # bins with no mappability will result in NaNs, better to filter them out
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually-defined columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                     for k in cisprc
                     if not k in badcol) / (len(cisprc) - len(badcol))

    printime('  - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
              for k in xrange(size)]

    if normalization == 'ICE':
        printime('  - ICE normalization')
        hic_data = load_hic_data_from_bam(
            inbam, resolution, filter_exclude=filter_exclude,
            tmpdir=outdir, ncpus=ncpus)
        hic_data.bads = badcol
        hic_data.normalize_hic(iterations=100, max_dev=0.000001)
        biases = hic_data.bias.copy()
        del(hic_data)
    elif normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict((k, b / mean_col * mean_col**0.5)
                      for k, b in enumerate(biases))
    elif normalization == 'SQRT':
        printime('  - Vanilla-SQRT normalization')
        biases = [b**0.5 for b in biases]
        mean_col = nanmean(biases)
        biases = dict((k, b / mean_col * mean_col**0.5)
                      for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(set([len(biases), len(mappability), len(n_rsites),
                    len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, p_fit=p_fit, tot=biases,
                      map=mappability, res=n_rsites, cg=cg_content, seed=seed)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    elif normalization == 'custom':
        n_pos = 0
        biases = {}
        print 'Using provided biases...'
        with open(biases_path, 'r') as r:
            r.next()
            for line in r:
                if line[0] == 'N':
                    # b = float('nan')
                    badcol[n_pos] = 0
                    biases[n_pos] = float('nan')
                else:
                    b = float(line)
                    if b == 0:
                        badcol[n_pos] = 0
                        biases[n_pos] = float('nan')
                    else:
                        biases[n_pos] = b
                n_pos += 1
        for add in range(max(biases.keys()), total + 1):
            biases[add] = float('nan')
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime('  - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases,)))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)
    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
                region, start, end, extra_out))
            procs.append(pool.apply_async(get_cis_perc,
                                          args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print '    * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equal to 1 in average)
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir, 'tmp_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:
                    try:
                        nrmdec[c][k] = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict((c, section_pos[c][1] - section_pos[c][0])
                    for c in section_pos)
    # initialize dictionary
    ndiags = dict((c, dict((k, 0) for k in xrange(len_crms[c])))
                  for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr
                                         for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise ** -2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store count by diagonal
        tmpsum = 0  # store count by diagonal
        ndiag = 0
        val = 0
        previous = []  # store diagonals to be summed in case not reaching the minimum
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for l in previous:
                        nrmdec[crm][l] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc

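# Hedged sketch (not part of the original module): the decay-rescaling loop
# above groups consecutive diagonals until their raw sum exceeds
# min_n = signal_to_noise ** -2 (0.05 ** -2 == 400), then assigns one shared
# value to the whole group. A simplified standalone illustration (averaging
# per diagonal rather than per cell count, as the real code does):
def group_diagonals(raw_sums, signal_to_noise=0.05):
    """Yield (diagonals, averaged_value) groups with enough raw counts."""
    min_n = signal_to_noise ** -2.  # 400 by default
    group, total = [], 0
    for dist, count in enumerate(raw_sums):
        group.append(dist)
        total += count
        if total > min_n:
            yield group, float(total) / len(group)
            group, total = [], 0

# list(group_diagonals([300, 150, 30, 500]))
#   -> [([0, 1], 225.0), ([2, 3], 265.0)]
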
def check_options(opts):
    if not opts.workdir:
        raise Exception('ERROR: output option required.')
    if opts.type != 'map':
        raise NotImplementedError('ERROR: not yet there')
    if not opts.genome:
        raise Exception('ERROR: genome parameter required.')
    if not opts.workdir:
        raise Exception('ERROR: workdir parameter required.')

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: no output files found, nothing to skip...')
        opts.skip = False

    if opts.workdir.endswith('/'):
        opts.workdir = opts.workdir[:-1]

    # write log
    newbie = False
    if not path.exists(opts.workdir):
        newbie = True
        mkdir(opts.workdir)
    log_format = '[PARSING]   %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if (not path.exists(vlog_path)
            or open(vlog_path).readlines() != dependencies):
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, '
                 'see JOBs table above')
    except OSError:
        pass

def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads  = path.join(opts.workdir, '03_filtered_reads',
                       'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print ' - median insert size =', median
    print ' - median absolute deviation of insert size =', mad
    print (' - max insert size (when a gap in continuity of > 10 bp is '
           'found in fragment lengths) ='), max_f

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print (' Using the maximum continuous fragment size '
           '(%d bp) to check '
           'for pseudo-dangling ends') % max_mole
    print (' Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print "identify pairs to filter..."
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=0.001, max_frag_size=100000,
                          min_frag_size=50, re_proximity=5,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)

def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file, chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad columns to the chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = (size if opts.max_tad_size is None
                            else opts.max_tad_size)
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end',
                                               'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path': out_tad,
                               'num' : len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)

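# Hedged sketch (not part of the original module): reading back one of the
# per-chromosome TAD tables written above. The column layout (index, start,
# end, score, density; 1-based bin coordinates) is inferred from the write
# loop in run().
def read_tad_table(tsv_path):
    tads = []
    with open(tsv_path) as handler:
        handler.next()  # skip the header line
        for line in handler:
            idx, start, end, score, density = line.strip().split('\t')
            tads.append({'start'  : int(start),
                         'end'    : int(end),
                         'score'  : float(score),
                         'density': float(density)})
    return tads
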
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None,
                 frag_map=True, min_seq_len=15, windows=None, add_site=True,
                 clean=False, get_nread=False, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done
    either without knowledge of the restriction enzyme used, or for
    experiments performed without one, like Micro-C (iterative mapping), or
    using the ligation sites created from the digested ends (fragment-based
    mapping).

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment
       e.g. HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and puts back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the mapping.
       This parameter allows to do classical iterative mapping, e.g.
       windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
       windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = True if not windows[0] else False
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False

    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads,
            mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=(input_reads.endswith('.fastq') or
                   input_reads.endswith('.fastq.gz') or
                   input_reads.endswith('.fq.gz') or
                   input_reads.endswith('.dsrc')),
            min_seq_len=min_seq_len, trim=win, skip=skip,
            nthreads=nthreads, light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix)
        else:
            print 'Mapping full reads...', curr_map
        if not skip:
            _gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
        # parse map file to extract not uniquely mapped reads
        print 'Parsing result...'
        _gem_filter(out_map_path,
                    curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                    os.path.join(out_map_dir,
                                 base_name + '_full_%s-%s%s.map' % (
                                     beg, end, suffix)))
        # clean
        if clean:
            print '   x removing GEM input %s' % curr_map
            os.system('rm -f %s' % (curr_map))
            print '   x removing map %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        # for next round, we will use remaining unmapped reads
        input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append(
            (os.path.join(out_map_dir,
                          base_name + '_full_%s-%s%s.map' % (beg, end,
                                                             suffix)),
             counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads,
            mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print '   x removing pre-GEM input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            print 'Mapping fragments of remaining reads...'
            _gem_mapping(gem_index_path, frag_map, out_map_path, **kwargs)
        print 'Parsing result...'
        _gem_filter(out_map_path,
                    curr_map + '_fail%s.map' % (suffix),
                    os.path.join(out_map_dir,
                                 base_name + '_frag_%s-%s%s.map' % (
                                     beg, end, suffix)))
        # clean
        if clean:
            print '   x removing GEM input %s' % frag_map
            os.system('rm -f %s' % (frag_map))
            print ('   x removing failed to map ' + curr_map +
                   '_fail%s.map' % (suffix))
            os.system('rm -f %s' % (curr_map + '_fail%s.map' % (suffix)))
            print '   x removing tmp mapped %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        outfiles.append(
            (os.path.join(out_map_dir,
                          base_name + '_frag_%s-%s%s.map' % (beg, end,
                                                             suffix)),
             counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]

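# Hedged usage sketch (not part of the original module): classical iterative
# mapping of one FASTQ side through full_mapping() as documented above. The
# index, FASTQ and output paths are hypothetical placeholders.
if __name__ == '__main__':
    maps = full_mapping('/path/to/genome.gem',
                        '/path/to/sample_read1.fastq.gz',
                        'results/01_mapped_r1',
                        r_enz='HindIII', frag_map=True, clean=True,
                        windows=((1, 25), (1, 50), (1, 75)),
                        temp_dir='/scratch/tmp', nthreads=8)
    # `maps` lists the generated MAP files, to be passed to
    # pytadbit.parsers.map_parser.parse_map
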
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a cPickled genome to make it faster
        genome = load(open(opts.genome[0]))
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 1 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2,
               out_file1, out_file2, launch_time, finish_time)

def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads  = path.join(opts.workdir, '03_filtered_reads',
                       'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    if opts.fast_fragment:
        reads = fname1
        counts_multis = ['#' in line.split('\t')[0]
                         for line in open(reads)]
        count = len(counts_multis)
        multiples = {}
        multiples[1] = sum([count_mult for count_mult in counts_multis
                            if count_mult])
        del counts_multis
    else:
        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads,
                                            compress=opts.compress_input)

    # compute insert size
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    try:
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
    except ZeroDivisionError:
        warn('WARNING: cannot compute fragment length, too few '
             'dangling-ends. Setting median length to 400 nt.')
        median = max_f = mad = 0
    if median < 50:
        warn('WARNING: fragment length too short ({}). '
             'Setting median length to 400 nt.'.format(median))
        median, max_f, mad = 400, 100, 40
    if opts.median:
        median = opts.median
    if opts.max_f:
        max_f = opts.max_f
    if opts.mad:
        mad = opts.mad

    print('  - median insert size =', median)
    print('  - median absolute deviation of insert size =', mad)
    print('  - max insert size (when a gap in continuity of > 10 bp is '
          'found in fragment lengths) =', max_f)

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print('   Using the maximum continuous fragment size '
          '(%d bp) to check for pseudo-dangling ends' % max_mole)
    print('   Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print("identify pairs to filter...")
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          strict_duplicates=opts.strict_duplicates,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs,
               masked, outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)

def read_bam(inbam, filter_exclude, resolution, ncpus=8,
             region1=None, start1=None, end1=None,
             region2=None, start2=None, end2=None,
             nchunks=None, tmpdir='.', verbose=True,
             normalize=False, max_size=None):
    bamfile = AlignmentFile(inbam, 'rb')
    # get chromosomes and genome sizes
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    # define genomic bins
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])
    if not bins:
        raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm))

    # define start, end position of region to grab
    start_bin1 = 0
    end_bin1 = len(bins) + 1
    regions = bamfile.references
    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)
    else:
        total = len(bins)
        if start1 is not None or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1 is not None:
        start_bin1 = section_pos[region1][0] + start1 / resolution
    else:
        if region1:
            start_bin1 = section_pos[region1][0]
        else:
            start_bin1 = 0
        start1 = 0
    if end1 is not None:
        end_bin1 = section_pos[region1][0] + end1 / resolution
    else:
        if region1:
            end_bin1 = section_pos[region1][1]
            end1 = sections[region1] * resolution
        else:
            end_bin1 = total
            end1 = total * resolution

    # define chunks, using at most 100 sub-divisions of region1
    total = end_bin1 - start_bin1
    regs = []
    begs = []
    ends = []
    if nchunks is None:
        njobs = min(total, 100) + 1
    else:
        njobs = nchunks
    nbins = total / njobs + 1
    for i in xrange(start_bin1, end_bin1, nbins):
        if i + nbins > end_bin1:  # make sure that we stop at the right place
            nbins = end_bin1 - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # reduce dictionaries
    all_bins = []
    seenbins = set()
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if region1:
            start = start_bin1 - beg_crm
            end   = end_bin1 - beg_crm
        else:
            start = 0
            end   = section_pos[crm][1] - section_pos[crm][0]
        all_bins.extend([(crm, i) for i in xrange(start, end)
                         if not (crm, i) in seenbins])
        seenbins = set(all_bins)
    del(seenbins)

    bins_dict1 = dict((j, i) for i, j in enumerate(all_bins))
    if region2:
        if not region2 in section_pos:
            raise Exception('ERROR: chromosome %s not found' % region2)
        bins = []
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 / resolution
        else:
            start_bin2 = section_pos[region2][0]
            start2 = 0
        if end2 is not None:
            end_bin2 = section_pos[region2][0] + end2 / resolution
        else:
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end   = end_bin2 - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        start_bin2 = start_bin1
        end_bin2   = end_bin1
        bins_dict2 = bins_dict1

    size1 = end_bin1 - start_bin1
    size2 = end_bin2 - start_bin2
    if verbose:
        printime('\n  (Matrix size %dx%d)' % (size1, size2))
    if max_size and max_size < size1 * size2:
        raise Exception(('ERROR: matrix too large ({0}x{1}) should be at most '
                         '{2}x{2}').format(size1, size2, int(max_size**0.5)))

    pool = mu.Pool(ncpus)
    # create random hash associated to the run:
    rand_hash = "%016x" % getrandbits(64)

    ## RUN!
    if verbose:
        printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)))

    # empty all_bins array if we are not going to normalize
    if not normalize:
        all_bins = []
    procs = []
    for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
        if ncpus == 1:
            _read_bam_frag(inbam, filter_exclude, all_bins,
                           bins_dict1, bins_dict2, rand_hash,
                           resolution, tmpdir, region, b, e,)
        else:
            procs.append(pool.apply_async(
                _read_bam_frag, args=(inbam, filter_exclude, all_bins,
                                      bins_dict1, bins_dict2, rand_hash,
                                      resolution, tmpdir, region, b, e,)))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()
    bin_coords = start_bin1, end_bin1, start_bin2, end_bin2
    chunks = regs, begs, ends
    return regions, rand_hash, bin_coords, chunks

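# Hedged usage sketch (not part of the original module): dumping one
# intra-chromosomal matrix with write_matrix(), which drives read_bam()
# above. The BAM, biases pickle and output paths are hypothetical
# placeholders; the keyword arguments match the signatures shown here.
if __name__ == '__main__':
    nheader = write_matrix('/path/to/sample.bam', 100000,
                           '/path/to/biases_100kb.pickle',
                           'results/chr3_100kb.abc',
                           region1='chr3', window='all',
                           tmpdir='/scratch/tmp', ncpus=8)
    # nheader is the number of '#' header lines written before the data rows
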
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros, min_count=opts.min_count,
            draw_hist=True, by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization',
                'bad_columns_%s_%d_%d_%s.pdf' % (
                    opts.reso, opts.perc_zeros, opts.min_count,
                    param_hash)) if not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%d_%s.tsv' % (
            opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Get biases using ICE...'
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=True)
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False,
                                             diagonal=True)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False,
                                             diagonal=False)

    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % (
            opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=not opts.filter_only, savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias]) + '\n')
        out_bias.close()

    # pickle the HiC-data object
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso),
                                                       param_hash))
    out = open(pickle_path, 'w')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig,
                    savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print " Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(
                    opts.workdir, '04_normalization',
                    'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig,
                    savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print " Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(
                    opts.workdir, '04_normalization',
                    'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(
                opts.workdir, '04_normalization',
                'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=genom_map_nrm_fig,
                    savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads, len(hic_data.bads.keys()),
               len(hic_data),
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time)
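
# A minimal sketch of how the files written by run() above could be loaded
# back later. It reuses path and nice() as used in this module and imports
# cPickle's load locally; workdir, reso and param_hash are hypothetical
# example arguments.
def _example_reload_normalization(workdir, reso, param_hash):
    from cPickle import load
    pickle_path = path.join(workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(reso), param_hash))
    hic_data = load(open(pickle_path))
    # the bias file is a two-column TSV: bin index and bias value
    bias_file = path.join(workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (reso, param_hash))
    biases = dict((int(i), float(b)) for i, b in
                  (line.split() for line in open(bias_file)))
    return hic_data, biases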
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None,
                 frag_map=True, min_seq_len=15, windows=None, add_site=True,
                 clean=False, **kwargs):
    """
    Do the mapping

    :param gem_index_path: path to the index file created from a reference
       genome using the gem-index tool
    :param fastq_path: path to a FASTQ file, either compressed or not
    :param out_map_dir: path to a directory where to store mapped reads in
       MAP format
    :param None r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII. Optional if the frag_map option is False
    :param True frag_map: two-step mapping; first the full-length reads are
       mapped, then the remaining unmapped reads are divided into
       restriction-enzyme fragments and each is mapped
    :param True add_site: when splitting the sequence at the ligation sites
       found, remove the ligation site and put back the original RE site
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        windows = (None, )
    for win in windows:
        # Prepare the FASTQ file and iterate over it
        curr_map = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=(input_reads.endswith('.fastq') or
                   input_reads.endswith('.fastq.gz')),
            min_seq_len=min_seq_len, trim=win)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s.map' % (beg, end)
        if end:
            print 'Mapping reads in window %s-%s...' % (beg, end)
        else:
            print 'Mapping full reads...', curr_map
        map_file = gem_mapping(gem_index_path, curr_map, out_map_path,
                               **kwargs)
        map_file.close()

        # parse map file to extract the not uniquely mapped reads
        print 'Parsing result...'
        _gem_filter(out_map_path, curr_map + '_filt_%s-%s.map' % (beg, end),
                    os.path.join(out_map_dir,
                                 base_name + '_full_%s-%s.map' % (beg, end)))
        # clean
        if clean:
            print '   x removing GEM input %s' % curr_map
            os.system('rm -f %s' % (curr_map))
            print '   x removing map %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        # for the next round, use the remaining unmapped reads
        input_reads = curr_map + '_filt_%s-%s.map' % (beg, end)
        outfiles.append(os.path.join(
            out_map_dir, base_name + '_full_%s-%s.map' % (beg, end)))

    # map again, splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site)
        out_map_path = frag_map + '_frag.map'
        print 'Mapping fragments of remaining reads...'
        map_file = gem_mapping(gem_index_path, frag_map, out_map_path,
                               **kwargs)
        map_file.close()
        print 'Parsing result...'
        _gem_filter(out_map_path, curr_map + '_fail.map',
                    os.path.join(out_map_dir, base_name + '_frag.map'))
        outfiles.append(os.path.join(out_map_dir, base_name + '_frag.map'))
    return outfiles
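
# Hypothetical usage of full_mapping() above: map one FASTQ file with the
# default fragment-based strategy. All paths and the enzyme name are
# illustrative examples only.
def _example_full_mapping():
    maps = full_mapping('/path/to/genome.gem',
                        '/path/to/sample_r1.fastq.gz',
                        '/path/to/01_mapped_r1',
                        r_enz='HindIII', frag_map=True,
                        nthreads=4, temp_dir='/path/to/tmp')
    # one '_full_<beg>-<end>.map' file per window, plus one '_frag.map'
    # file for the reads rescued by fragment-based mapping
    for fpath in maps:
        print fpath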
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(
            mreads1, opts.reso, biases=biases1,
            tmpdir=path.join(opts.workdir, '00_merge'),
            ncpus=opts.cpus, filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(
            mreads2, opts.reso, biases=biases2,
            tmpdir=path.join(opts.workdir, '00_merge'),
            ncpus=opts.cpus, filter_exclude=filter_exclude)

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print '      - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print '      - reproducibility score: %.4f' % (reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        # make sure every value passed to save_to_db below is defined
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Merging experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                 mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line accordingly
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE
                                           ).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools + ' index %s' % (outbam))

    finish_time = time.localtime()

    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
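
# The samtools version check above decides whether 'samtools index' accepts
# the '-@' threading flag (added in samtools 1.3.1). A minimal sketch of the
# same comparison, with a hard-coded banner line instead of a live call:
def _example_samtools_index_cmd(ncpus, outbam):
    from distutils.version import LooseVersion
    banner = 'Version: 1.9 (using htslib 1.9)'  # example banner text
    version = LooseVersion(banner.split()[1])
    if version >= LooseVersion('1.3.1'):
        return 'samtools index -@ %d %s' % (ncpus, outbam)
    return 'samtools index %s' % (outbam)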
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None,
                 frag_map=True, min_seq_len=15, windows=None, add_site=True,
                 clean=False, get_nread=False, **kwargs):
    """
    Do the mapping

    :param gem_index_path: path to the index file created from a reference
       genome using the gem-index tool
    :param fastq_path: path to a FASTQ file, either compressed or not
    :param out_map_dir: path to a directory where to store mapped reads in
       MAP format
    :param None r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII. Optional if the frag_map option is False
    :param True frag_map: two-step mapping; first the full-length reads are
       mapped, then the remaining unmapped reads are divided into
       restriction-enzyme fragments and each is mapped
    :param True add_site: when splitting the sequence at the ligation sites
       found, remove the ligation site and put back the original RE site
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.
    :param False get_nread: returns a list of lists where each element
       contains a path and the number of reads processed

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        windows = (None, )
    elif isinstance(windows[0], int):
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
    for win in windows:
        # Prepare the FASTQ file and iterate over it
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=(input_reads.endswith('.fastq'   ) or
                   input_reads.endswith('.fastq.gz') or
                   input_reads.endswith('.fq.gz'   ) or
                   input_reads.endswith('.dsrc'    )),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix)
        else:
            print 'Mapping full reads...', curr_map
        if not skip:
            gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)

        # parse map file to extract the not uniquely mapped reads
        print 'Parsing result...'
        _gem_filter(out_map_path,
                    curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                    os.path.join(out_map_dir,
                                 base_name + '_full_%s-%s%s.map' % (
                                     beg, end, suffix)))
        # clean
        if clean:
            print '   x removing GEM input %s' % curr_map
            os.system('rm -f %s' % (curr_map))
            print '   x removing map %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        # for the next round, use the remaining unmapped reads
        input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append((os.path.join(
            out_map_dir,
            base_name + '_full_%s-%s%s.map' % (beg, end, suffix)), counter))

    # map again, splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads)
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            print 'Mapping fragments of remaining reads...'
            gem_mapping(gem_index_path, frag_map, out_map_path, **kwargs)
        print 'Parsing result...'
        _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix),
                    os.path.join(out_map_dir,
                                 base_name + '_frag_%s-%s%s.map' % (
                                     beg, end, suffix)))
        outfiles.append((os.path.join(
            out_map_dir,
            base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)), counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
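
# Hypothetical call to the extended full_mapping() above, doing classical
# iterative mapping over growing windows and asking for per-file read counts.
# Paths, enzyme and window sizes are examples only.
def _example_iterative_windows():
    out = full_mapping('/path/to/genome.gem',
                       '/path/to/sample_r1.fastq.gz',
                       '/path/to/01_mapped_r1',
                       r_enz='MboI', windows=((1, 25), (1, 50), (1, 75)),
                       frag_map=True, get_nread=True, nthreads=8)
    # with get_nread=True each element is a (path, number_of_reads) pair
    for fpath, nreads in out:
        print '%12d reads -> %s' % (nreads, fpath)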
def check_options(opts):
    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if you '
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'], stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            opts.gem_version = int(out[1])
        except ValueError:
            opts.gem_version = 2
            print('Falling back to GEM v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception('ERROR: Fast fragment mapping needs both fastq '
                            'files. Please specify --fastq2')
        if opts.read != 0:
            raise Exception('ERROR: Fast fragment mapping needs to be '
                            'specified with --read 0')
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')
    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for the most probable restriction enzyme in '
              'file: %s' % (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found. ' % (renz)
                  + 'Use one of:\n\n'
                  + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: no previous output found in workdir, cannot skip...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(('ERROR: FASTQ file %s in wrong format, check it')
                      % (opts.fastq))

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)

    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}] %(message)s'.format(opts.fastq,
                                                          opts.read)
    # else:
    #     log_format = '[DEFAULT] %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to also display the log on stdout
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check mapper extra options
    if opts.mapper_param:
        if (len(opts.mapper_param) == 1 and
                ('-' in opts.mapper_param[0] or
                 '--' in opts.mapper_param[0])):
            # single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            opts.mapper_param = dict([o.split(':')
                                      for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set(["granularity", "q", "quality-format",
                                "gem-quality-threshold", "mismatch-alphabet",
                                "m", "e", "min-matched-bases",
                                "max-big-indel-length", "s",
                                "strata-after-best", "fast-mapping",
                                "unique-mapping", "d", "D",
                                "allow-incomplete-strata",
                                "max-decoded-matches", "min-decoded-strata",
                                "p", "paired-end-alignment", "b",
                                "map-both-ends", "min-insert-size",
                                "max-insert-size", "E",
                                "max-extendable-matches",
                                "max-extensions-per-match", "unique-pairing"])
        for k in opts.mapper_param:
            if k not in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" is not a valid GEM option '
                     'or is not supported by this tool.') % k)

    # create empty DB if it does not exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
        exit('WARNING: exact same job already computed, see JOBs table above')
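
# check_options() above accepts extra mapper options either as a single
# quoted command-line string or as 'key:value' pairs. A small sketch of the
# same parsing logic, on hypothetical example inputs:
def _example_mapper_param_parsing(mapper_param):
    if (len(mapper_param) == 1 and
            ('-' in mapper_param[0] or '--' in mapper_param[0])):
        # a single quoted string, e.g. ['-q offset-33 -T 4']
        return mapper_param[0].split()
    # otherwise 'key:value' pairs, e.g. ['q:offset-33', 'T:4']
    return dict([o.split(':') for o in mapper_param])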
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]
        # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print 'loading %s\n  at resolution %s' % (mreads, nice(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print '  - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta,
                                                   chr_filter=opts.crms),
                                       reso, chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True,
            suffix=param_hash, rich_in_A=rich_in_A,
            show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs, ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if crm not in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                        (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad columns to the chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0
                               for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1 if opts.max_tad_size is None
                            else opts.max_tad_size)
            result = tadbit([matrix], remove=to_rm, n_cpus=opts.cpus,
                            verbose=opts.verbose, max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute the height of the called TADs
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                               'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path': out_tad, 'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except Exception:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
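
# The TAD tables written by run() above are TSVs with one border per line
# (index, start, end, score, density). A minimal sketch of reading one back;
# the path is a hypothetical example.
def _example_read_tad_table(out_tad='/path/to/tads_100kb/chr1_abcd.tsv'):
    tads = []
    for line in open(out_tad):
        if line.startswith('#'):
            continue
        _, start, end, score, density = line.split()
        tads.append((int(start), int(end), float(score), float(density)))
    return tads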
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    # create tmp directory
    if not opts.tmp:
        temp_dir = opts.workdir + '_tmp_r%d_%s' % (opts.read, param_hash)
    else:
        temp_dir = path.join(opts.tmp,
                             'TADbit_tmp_r%d_%s' % (opts.read, param_hash))

    # QC plot
    fig_path = path.join(opts.workdir,
                         '%s_%s_%s.png' % (path.split(opts.fastq)[-1],
                                           '-'.join(map(str, opts.renz)),
                                           param_hash))
    logging.info('Generating Hi-C QC plot')

    dangling_ends, ligated = quality_plot(opts.fastq, r_enz=opts.renz,
                                          nreads=100000, paired=False,
                                          savefig=fig_path)
    for renz in dangling_ends:
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends[renz])
    for renz in ligated:
        logging.info('  - Ligation sites: %.3f%%', ligated[renz])
    if opts.skip_mapping:
        save_to_db(opts, dangling_ends, ligated, fig_path, [],
                   launch_time, time.localtime())
        return

    # Mapping
    if opts.fast_fragment:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))
        logging.info('parsing genomic sequence')
        try:
            # allows the use of a pickled genome to make it faster
            genome_seq = load(open(opts.genome[0], 'rb'))
        except (UnpicklingError, KeyError):
            genome_seq = parse_fasta(opts.genome)
        logging.info('mapping %s and %s to %s', opts.fastq, opts.fastq2,
                     opts.workdir)
        outfiles = fast_fragment_mapping(
            opts.index, opts.fastq, opts.fastq2, opts.renz, genome_seq,
            path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash),
            clean=not opts.keep_tmp, get_nread=True,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param, suffix=param_hash,
            temp_dir=temp_dir, nthreads=opts.cpus)
    else:
        logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                     opts.workdir)
        outfiles = full_mapping(
            opts.index, opts.fastq,
            path.join(opts.workdir, '01_mapped_r%d' % (opts.read)),
            mapper=opts.mapper, r_enz=opts.renz, temp_dir=temp_dir,
            nthreads=opts.cpus, frag_map=not opts.iterative,
            clean=not opts.keep_tmp, windows=opts.windows, get_nread=True,
            skip=opts.skip, suffix=param_hash,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param)

    # adjust line count
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    try:
        save_to_db(opts, dangling_ends, ligated, fig_path, outfiles,
                   launch_time, finish_time)
    except Exception:
        # release lock
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # write machine log
    try:
        while path.exists(path.join(opts.workdir, '__lock_log')):
            time.sleep(0.5)
        open(path.join(opts.workdir, '__lock_log'), 'a').close()
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            mlog.write('\n'.join([('# MAPPED READ%s\t%d\t%s' % (
                opts.read, num, out)) for out, num in outfiles]) + '\n')
        # release lock
        try:
            remove(path.join(opts.workdir, '__lock_log'))
        except OSError:
            pass
    except Exception:
        # release lock
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + temp_dir)
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    :param gem_index_path: path to the index file created from a reference
       genome using the gem-index tool
    :param fastq_path: path to a FASTQ file, either compressed or not
    :param out_sam_path: path to a directory where to store mapped reads in
       SAM/BAM format (see option output_is_bam)
    :param range_start: list of integers representing the start position of
       each read fragment to be mapped (starting at 1 includes the first
       nucleotide of the read)
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped
    :param True single_end: whether reads are single-end (set to False when
       the FASTQ contains paired-end flags)
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       however other options are chosen, all the matches up to the specified
       number of substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a
       time. If -1, all reads will be processed in one run (more RAM needed).
    :param False output_is_bam: use the binary (compressed) form of the
       generated out-files with mapped reads (recommended to save disk space)
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # N_WINDOWS is a module-level counter shared across the recursive calls
    global N_WINDOWS

    gem_index_path      = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path          = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path        = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end          = kwargs.get('single_end'         , True)
    max_edit_distance   = kwargs.get('max_edit_distance'  , 0.04)
    mismatches          = kwargs.get('mismatches'         , 0.04)
    nthreads            = kwargs.get('nthreads'           , 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files           = kwargs.get('out_files'          , [])
    output_is_bam       = kwargs.get('output_is_bam'      , False)
    temp_dir            = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    for kw in kwargs:
        if kw not in ['single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir']:
            warn('WARNING: %s is not a usual keyword, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple)
                or not isinstance(range_stop, tuple)):
            raise Exception(
                'ERROR: range_start and range_stop should be lists')
        range_start = list(range_start)
        range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start)
            or not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = map(int, range_start)
            range_stop = map(int, range_stop)
            warn('WARNING: range_start and range_stop converted to integers')
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should '
                            'contain integers only')
    if (len(set(zip(range_start, range_stop))) < len(range_start)
            or len(range_start) != len(range_stop)):
        raise Exception('ERROR: range_start and range_stop should have the '
                        'same sizes and windows should be unique.')
    if any([i >= j for i, j in zip(range_start, range_stop)]):
        raise Exception('ERROR: start positions should always be lower than '
                        'stop positions.')
    if any([i <= 0 for i in range_start]):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the second line, which holds the sequence
    # (cannot use the "length" keyword, as it is not always present)
    try:
        _ = fastqh.next()
        raw_seq_len = len(fastqh.next().strip())
        fastqh.close()
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    if not N_WINDOWS:
        N_WINDOWS = len(range_start)
    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        kwargs['max_reads_per_chunk'] = -1
        print 'Split input file %s into chunks' % fastq_path
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print '%d chunks obtained' % len(chunked_files)
        for i, fastq_chunk_path in enumerate(chunked_files):
            N_WINDOWS = 0
            print 'Run iterative_mapping recursively on %s' % fastq_chunk_path
            out_files.extend(iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1),
                range_start[:], range_stop[:], **kwargs))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # Delete chunks only if the file was really chunked.
            if len(chunked_files) > 1:
                print 'Remove the chunks: %s' % ' '.join(chunked_files)
                os.remove(fastq_chunk_path)
        return out_files

    # end position according to the sequence in the file
    # (removes 1 in order to start at 1 instead of 0)
    try:
        seq_end = range_stop.pop(0)
        seq_beg = range_start.pop(0)
    except IndexError:
        return out_files
    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # output
    local_out_sam = out_sam_path + '.%d:%d-%d' % (
        N_WINDOWS - len(range_stop), seq_beg, seq_end)
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)
    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)
    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)
    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path,
                          output=local_out_sam, threads=nthreads,
                          single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(
        temp_dir, unmapped_fastq_path + '.%d:%d-%d' % (
            N_WINDOWS - len(range_stop), seq_beg, seq_end))
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)
    out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                       out_sam_path, range_start, range_stop,
                                       **kwargs))
    os.remove(unmapped_fastq_path)
    return out_files
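
# Hypothetical call to iterative_mapping() above: map windows 1-25, 1-30 and
# 1-35 of each read, writing one SAM file per window. Paths are examples.
def _example_iterative_mapping():
    sams = iterative_mapping('/path/to/genome.gem',
                             '/path/to/sample_r1.fastq.gz',
                             '/path/to/mapped/sample_r1.sam',
                             range_start=[1, 1, 1],
                             range_stop=[25, 30, 35],
                             nthreads=4, output_is_bam=False)
    print '\n'.join(sams)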
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    ncpus = opts.cpus
    if opts.biases:
        biases = load(open(opts.biases))
    else:
        biases = {}
    outdir = opts.outdir
    tmpdir = opts.tmpdir
    coord1 = opts.coord1
    coord2 = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception('ERROR: different resolution in bias file (you want '
                        '%d, there is %d).\n' % (resolution,
                                                 biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = start1 = end1 = None
        region2 = start2 = end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = start2 = end2 = None

    mkdir(outdir)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=filter_exclude,
                 normalizations=opts.matrices,
                 region1=region1, start1=start1, end1=end1,
                 region2=region2, start2=start2, end2=end2,
                 nchunks=opts.nchunks, append_to_tar=opts.tarfile,
                 ncpus=ncpus, tmpdir=tmpdir, verbose=not opts.quiet)
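
# The coordinate strings accepted by main() above follow the UCSC-like form
# 'chrom:start-end', a bare chromosome name meaning the full chromosome.
# A small sketch of the same parsing, on example strings:
def _example_parse_coord(coord):
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        return coord, None, None

# _example_parse_coord('chr3:1000000-2000000') -> ('chr3', 1000000, 2000000)
# _example_parse_coord('chr3')                 -> ('chr3', None, None)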
def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, cmap='jet', pdf=False, decay=True,
            perc=10, name=None, decay_resolution=None, **kwargs):
    """
    Function to retrieve data from a HiC-data object. Data can be stored as
    a square matrix, or drawn using matplotlib.

    :param data: can be either a path to a file with pre-processed reads
       (filtered or not), or a Hi-C-data object
    :param None resolution: at which to bin the data (try having a dense
       matrix with < 10% of cells with zero interaction counts). Note: not
       necessary if a hic_data object is passed as 'data'.
    :param False normalized: use normalized data, based on precalculated
       biases
    :param masked: a list of columns to be removed, usually because of too
       few interactions
    :param False by_chrom: data can be stored in a partitioned way. This
       parameter can take the values of:
        * 'intra': one output per each chromosome will be created
        * 'inter': one output per each possible pair of chromosomes will be
           created
        * 'all'  : both of the above outputs
    :param None savefig: path where to store the output images. Note that, if
       the by_chrom option is used, then savefig will be the name of the
       directory containing the output files.
    :param None savedata: path where to store the output matrices. Note that,
       if the by_chrom option is used, then savedata will be the name of the
       directory containing the output files.
    :param None focus: can be either two numbers (i.e.: (1, 100)) specifying
       the start and end position of the sub-matrix to display (start and
       end, along the diagonal of the original matrix); or directly a
       chromosome name; or two chromosome names (i.e.:
       focus=('chr2', 'chrX')), in order to store the data corresponding to
       inter-chromosomal interactions between these two chromosomes
    :param True decay: plot the correlation between genomic distance and
       interactions (usually a decay).
    :param False force_image: force to generate an image even if the
       resolution is crazy...
    :param None clim: cutoff for the upper and lower bound in the coloring
       scale of the heatmap
    :param False pdf: when using the by_chrom option, to specify the format
       of the stored images
    :param jet cmap: color map to be used for the heatmap
    :param None decay_resolution: chromatin fragment size to consider when
       calculating the decay of the number of interactions with genomic
       distance. Default is equal to the resolution of the matrix.
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: decay is not available when get_sections is off.')
            decay = False

    hic_data = data
    resolution = data.resolution
    if not decay_resolution:
        decay_resolution = resolution
    if hic_data.bads and not masked:
        masked = hic_data.bads
    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception('Incompatible options focus and by_chrom\n')
        if savedata:
            mkdir(savedata)
        if savefig:
            mkdir(savefig)
        for i, crm1 in enumerate(hic_data.chromosomes):
            for crm2 in hic_data.chromosomes.keys()[i:]:
                if by_chrom == 'intra' and crm1 != crm2:
                    continue
                if by_chrom == 'inter' and crm1 == crm2:
                    continue
                try:
                    subdata = hic_data.get_matrix(focus=(crm1, crm2),
                                                  normalized=normalized)
                    start1, _ = hic_data.section_pos[crm1]
                    start2, _ = hic_data.section_pos[crm2]
                    masked1 = {}
                    masked2 = {}
                    if focus and hic_data.bads:
                        # rescale masked
                        masked1 = dict([(m - start1, hic_data.bads[m])
                                        for m in hic_data.bads])
                        masked2 = dict([(m - start2, hic_data.bads[m])
                                        for m in hic_data.bads])
                    if masked1 or masked2:
                        for row in xrange(len(subdata)):
                            if row in masked1:
                                subdata[row] = [float('nan')
                                                for _ in xrange(len(subdata))]
                            for col in xrange(len(subdata)):
                                if col in masked2:
                                    subdata[row][col] = float('nan')
                    if savedata:
                        hic_data.write_matrix('%s/%s.mat' % (
                            savedata, '_'.join(set((crm1, crm2)))),
                            focus=(crm1, crm2), normalized=normalized)
                    if show or savefig:
                        if (len(subdata) > 10000
                                and not kwargs.get('force_image', False)):
                            warn('WARNING: Matrix image not created, more '
                                 'than 10000 rows, use a lower resolution '
                                 'to create images')
                            continue
                        draw_map(subdata,
                                 OrderedDict([(k, hic_data.chromosomes[k])
                                              for k in hic_data.chromosomes.keys()
                                              if k in [crm1, crm2]]),
                                 hic_data.section_pos,
                                 '%s/%s.%s' % (savefig,
                                               '_'.join(set((crm1, crm2))),
                                               'pdf' if pdf else 'png'),
                                 show, one=True, clim=clim, cmap=cmap,
                                 decay_resolution=decay_resolution,
                                 perc=perc, name=name, cistrans=float('NaN'))
                except ValueError as e:
                    print 'Value ERROR: problem with chromosome %s' % crm1
                    print str(e)
                except IndexError as e:
                    print 'Index ERROR: problem with chromosome %s' % crm1
                    print str(e)
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(opts, reads,
                                                          opts.jobids)
    else:
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a pickled genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2, re_name=renz,
                                       verbose=True, genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2, re_name=renz,
                                       verbose=True, genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 1 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2,
               out_file1, out_file2, launch_time, finish_time)
def find_compartments(self, crms=None, savefig=None, savedata=None,
                      show=False, **kwargs):
    """
    Search for A/B compartments in each chromosome of the Hi-C matrix.
    The Hi-C matrix is normalized by the number of interactions expected
    at a given distance, and by visibility (one iteration of ICE). A
    correlation matrix is then calculated from this normalized matrix, and
    its first eigenvector is used to identify compartments: changes in
    sign mark the boundaries between compartments.
    The result is stored as a dictionary of compartment boundaries, keys
    being chromosome names.

    :param 99 perc_zero: to filter bad columns
    :param 0.05 signal_to_noise: to calculate expected interaction counts;
       if not enough reads are observed at a given distance the
       observations of distance+1 are summed. A signal-to-noise ratio of
       < 0.05 corresponds to > 400 reads.
    :param None crms: only run on this given list of chromosomes
    :param None savefig: path to a directory to store matrices with
       compartment predictions, one image per chromosome, stored under
       'chromosome-name.png'.
    :param False show: show the plot
    :param None savedata: path to a new file to store compartment
       predictions, one file only.
    :param -1 vmin: for the color scale of the plotted map
    :param 1 vmax: for the color scale of the plotted map

    TODO: this is really slow...

    Note: building the distance matrix using the amount of interactions
    instead of the mean correlation generally gives worse results.
    """
    if not self.bads:
        if kwargs.get('verbose', True):
            print 'Filtering bad columns %d' % kwargs.get('perc_zero', 99)
        self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
                            by_mean=False, silent=True)
    if not self.expected:
        if kwargs.get('verbose', True):
            print 'Normalizing by expected values'
        self.expected = expected(self, bads=self.bads, **kwargs)
    if not self.bias:
        if kwargs.get('verbose', True):
            print 'Normalizing by ICE (1 round)'
        self.normalize_hic(iterations=0)
    if savefig:
        mkdir(savefig)

    cmprts = {}
    for sec in self.section_pos:
        if crms and sec not in crms:
            continue
        if kwargs.get('verbose', False):
            print 'Processing chromosome', sec
            warn('Processing chromosome %s' % (sec))
        matrix = [[(float(self[i,j]) / self.expected[abs(j-i)]
                    / self.bias[i] / self.bias[j])
                   for i in xrange(*self.section_pos[sec])
                   if not i in self.bads]
                  for j in xrange(*self.section_pos[sec])
                  if not j in self.bads]
        if not matrix:  # MT chromosome will fall there
            warn('Chromosome %s is probably MT :)' % (sec))
            cmprts[sec] = []
            continue
        for i in xrange(len(matrix)):
            for j in xrange(i + 1, len(matrix)):
                matrix[i][j] = matrix[j][i]
        matrix = [list(m) for m in corrcoef(matrix)]
        try:
            # eigsh is very fast when only the first eigenvector is asked for
            _, evect = eigsh(array(matrix), k=1)
        except LinAlgError:
            warn('Chromosome %s too small to compute PC1' % (sec))
            cmprts[sec] = []  # Y chromosome, or so...
            continue
        first = list(evect[:, -1])
        beg, end = self.section_pos[sec]
        bads = [k - beg for k in self.bads if beg <= k <= end]
        _ = [first.insert(b, 0) for b in bads]
        _ = [matrix.insert(b, [float('nan')] * len(matrix[0]))
             for b in bads]
        _ = [matrix[i].insert(b, float('nan'))
             for b in bads for i in xrange(len(first))]
        breaks = [0] + [i for i, (a, b) in
                        enumerate(zip(first[1:], first[:-1]))
                        if a * b < 0] + [len(first)]
        breaks = [{'start': b, 'end': breaks[i + 1]}
                  for i, b in enumerate(breaks[:-1])]
        cmprts[sec] = breaks

        # calculate compartment internal density
        for k, cmprt in enumerate(cmprts[sec]):
            beg = self.section_pos[sec][0]
            beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
            sec_matrix = [(self[i,j] / self.expected[abs(j-i)]
                           / self.bias[i] / self.bias[j])
                          for i in xrange(beg1, end1) if not i in self.bads
                          for j in xrange(i, end1) if not j in self.bads]
            try:
                cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
            except ZeroDivisionError:
                cmprt['dens'] = 0.
        try:
            meanh = sum([cmprt['dens'] for cmprt in cmprts[sec]]) / len(cmprts[sec])
        except ZeroDivisionError:
            meanh = 1.
        for cmprt in cmprts[sec]:
            try:
                cmprt['dens'] /= meanh
            except ZeroDivisionError:
                cmprt['dens'] = 1.
        gammas = {}
        for gamma in range(101):
            gammas[gamma] = _find_ab_compartments(float(gamma) / 100, matrix,
                                                  breaks, cmprts[sec],
                                                  save=False)
            # print gamma, gammas[gamma]
        gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
        _ = _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)
        if savefig or show:
            vmin = kwargs.get('vmin', -1)
            vmax = kwargs.get('vmax',  1)
            if vmin == 'auto' == vmax:
                vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                vmin = -vmax
            plot_compartments(sec, first, cmprts, matrix, show,
                              savefig + '/chr' + sec + '.pdf',
                              vmin=vmin, vmax=vmax)
            plot_compartments_summary(sec, cmprts, show,
                                      savefig + '/chr' + sec + '_summ.pdf')
    self.compartments = cmprts
    if savedata:
        self.write_compartments(savedata)
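# The core of find_compartments is: observed/expected normalization, a
# correlation matrix, its leading eigenvector, and splits at sign changes.
# A self-contained sketch of that pipeline on a random symmetric matrix
# (illustrative only; a real input would be a normalized Hi-C chromosome):
import numpy as np
from scipy.sparse.linalg import eigsh

norm = np.abs(np.random.randn(100, 100))
norm = (norm + norm.T) / 2                 # stand-in for an O/E Hi-C matrix
corr = np.corrcoef(norm)
_, evect = eigsh(corr, k=1)                # only the first eigenvector needed
pc1 = evect[:, -1]
# a boundary wherever consecutive PC1 values change sign
breaks = [0] + [i for i in range(1, len(pc1))
                if pc1[i] * pc1[i - 1] < 0] + [len(pc1)]
compartments = [{'start': b, 'end': e}
                for b, e in zip(breaks[:-1], breaks[1:])]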
def get_intersection(fname1, fname2, out_path, verbose=False):
    """
    Merges the two files corresponding to each of the read ends. Reads
    found in both files are merged and written to an output file.

    Dealing with multiple contacts:
      - a pairwise contact is created for each possible combination of the
        multi-contacts. The name of the read is extended by '# 1/3' in case
        the reported pairwise contact corresponds to the first of 3
        possible combinations
      - it may happen that different contacts are mapped on a single RE
        fragment (if each is on a different end), in which case:
         - if no other fragment from this read is mapped, then both are
           kept
         - otherwise, they are merged into one longer fragment (as if they
           were mapped in the positive strand)

    :param fname1: path to a tab-separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab-separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a similar
       format as the inputs

    :returns: final number of pairs of interacting fragments, and a
       dictionary with the number of multiple contacts (the keys of the
       dictionary being the number of fragments caught together, can be
       3, 4, 5...)
    """
    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = reads1.next()
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = reads1.next()
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = reads2.next()
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = reads2.next()
    read2 = line2.split('\t', 1)[0]

    if header1 != header2:
        raise Exception('seems to be mapped over different chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos / nchunks
    buf = dict([(i, []) for i in xrange(nchunks + 1)])

    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in xrange(nchunks / int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary and then into temporary files
    # (the dictionary is emptied every 1 million entries)
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n ' % (
                        ('  %4d million reads' % (count_dots))
                        if count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                stdout.flush()
                count_dots += 1
            for _ in xrange(1000000):
                # iterate 1 million times, then write to files
                # same read id in both lines: we put the more upstream one
                # first and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
        write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print '\nFound %d pairs of reads mapping uniquely' % count

    # sort each tmp file according to first element (idx) and write them
    # to output file (without the idx);
    # sort also according to read 2 (to filter duplicates)
    # and also according to strand
    if verbose:
        print 'Sorting each temporary file by genomic coordinate'
    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        out.write(''.join(['\t'.join(l[1:]) for l in sorted(
            [l.split('\t') for l in open(
                path.join(tmp_dir, 'rep_%03d' % (b / int(nchunks**0.5)),
                          'tmp_%05d.tsv' % b))],
            key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()
    if verbose:
        print '\nRemoving temporary files...'
    system('rm -rf ' + tmp_dir)
    return count, multiples
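# get_intersection walks the two read-id-sorted files with a classic
# two-pointer merge (eq_reads/gt_reads compare read ids). A minimal sketch
# of that control flow on plain lists, with string comparison standing in
# for the read-id comparators:
def intersect_sorted(ids1, ids2):
    matched, i, j = [], 0, 0
    while i < len(ids1) and j < len(ids2):
        if ids1[i] == ids2[j]:        # same read seen on both sides
            matched.append(ids1[i])
            i += 1
            j += 1
        elif ids1[i] > ids2[j]:       # side 2 lags behind: advance it
            j += 1
        else:                         # side 1 lags behind: advance it
            i += 1
    return matched

assert intersect_sorted(['a', 'c', 'd'], ['b', 'c', 'e']) == ['c']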
def read_bam(inbam, filter_exclude, resolution, min_count=2500, normalization='Vanilla', mappability=None, n_rsites=None, cg_content=None, sigma=2, ncpus=8, factor=1, outdir='.', extra_out='', only_valid=False, normalize_only=False, max_njobs=100, min_perc=None, max_perc=None, extra_bads=None): bamfile = AlignmentFile(inbam, 'rb') sections = OrderedDict( zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths])) total = 0 section_pos = dict() for crm in sections: section_pos[crm] = (total, total + sections[crm]) total += sections[crm] bins = [] for crm in sections: len_crm = sections[crm] bins.extend([(crm, i) for i in xrange(len_crm)]) start_bin = 0 end_bin = len(bins) total = len(bins) regs = [] begs = [] ends = [] njobs = min(total, max_njobs) + 1 nbins = total / njobs + 1 for i in range(start_bin, end_bin, nbins): if i + nbins > end_bin: # make sure that we stop nbins = end_bin - i try: (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1] except IndexError: try: (crm1, beg1), (crm2, end2) = bins[i], bins[-1] except IndexError: break if crm1 != crm2: end1 = sections[crm1] beg2 = 0 regs.append(crm1) regs.append(crm2) begs.append(beg1 * resolution) begs.append(beg2 * resolution) ends.append(end1 * resolution + resolution) # last nt included ends.append(end2 * resolution + resolution - 1) # last nt not included (overlap with next window) else: regs.append(crm1) begs.append(beg1 * resolution) ends.append(end2 * resolution + resolution - 1) ends[-1] += 1 # last nucleotide included # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)]) printime(' - Parsing BAM (%d chunks)' % (len(regs))) bins_dict = dict([(j, i) for i, j in enumerate(bins)]) pool = mu.Pool(ncpus) procs = [] read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter for i, (region, start, end) in enumerate(zip(regs, begs, ends)): procs.append( pool.apply_async(read_bam_frag, args=( inbam, filter_exclude, bins, bins_dict, resolution, outdir, extra_out, region, start, end, ))) pool.close() print_progress(procs) pool.join() ## COLLECT RESULTS cisprc = {} printime(' - Collecting cis and total interactions per bin (%d chunks)' % (len(regs))) stdout.write(' ') for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)): if not countbin % 10 and countbin: stdout.write(' ') if not countbin % 50 and countbin: stdout.write(' %9s\n ' % ('%s/%s' % (countbin, len(regs)))) stdout.write('.') stdout.flush() fname = path.join( outdir, 'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out)) tmp_cisprc = load(open(fname)) system('rm -f %s' % fname) cisprc.update(tmp_cisprc) stdout.write('\n') printime(' - Removing columns with too few or too much interactions') if len(bamfile.references) == 1 and min_count is None: raise Exception("ERROR: only one chromosome can't filter by " "cis-percentage, set min_count instead") elif min_count is None and len(bamfile.references) > 1: badcol = filter_by_cis_percentage( cisprc, sigma=sigma, verbose=True, min_perc=min_perc, max_perc=max_perc, size=total, savefig=path.join( outdir, 'filtered_bins_%s_%s.png' % (nicer(resolution).replace(' ', ''), extra_out))) else: print( ' -> too few interactions defined as less than %9d ' 'interactions') % (min_count) badcol = {} countL = 0 countZ = 0 for c in xrange(total): if cisprc.get(c, [0, 0])[1] < min_count: badcol[c] = cisprc.get(c, [0, 0])[1] countL += 1 if not c in cisprc: countZ += 1 print ' -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % ( len(badcol), countZ, countL, 
total, float(len(badcol)) / total * 100) # no mappability will result in NaNs, better to filter out these columns if mappability: badcol.update((i, True) for i, m in enumerate(mappability) if not m) # add manually columns to bad columns if extra_bads: removed_manually = 0 for ebc in extra_bads: c, ebc = ebc.split(':') b, e = map(int, ebc.split('-')) b = b / resolution + section_pos[c][0] e = e / resolution + section_pos[c][0] removed_manually += (e - b) badcol.update(dict((p, 'manual') for p in xrange(b, e))) printime(' - Removed %d columns manually.' % removed_manually) raw_cisprc = sum( float(cisprc[k][0]) / cisprc[k][1] for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol)) printime(' - Rescaling sum of interactions per bins') size = len(bins) biases = [ float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1] for k in xrange(size) ] if normalization == 'Vanilla': printime(' - Vanilla normalization') mean_col = nanmean(biases) biases = dict( (k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases)) elif normalization == 'oneD': printime(' - oneD normalization') if len( set([ len(biases), len(mappability), len(n_rsites), len(cg_content) ])) > 1: print "biases", "mappability", "n_rsites", "cg_content" print len(biases), len(mappability), len(n_rsites), len(cg_content) raise Exception('Error: not all arrays have the same size') tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out)) mkdir(tmp_oneD) biases = oneD(tmp_dir=tmp_oneD, tot=biases, map=mappability, res=n_rsites, cg=cg_content) biases = dict((k, b) for k, b in enumerate(biases)) rmtree(tmp_oneD) else: raise NotImplementedError('ERROR: method %s not implemented' % normalization) # collect subset-matrices and write genomic one # out = open(os.path.join(outdir, # 'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w') printime(' - Getting sum of normalized bins') pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = path.join( outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out)) procs.append(pool.apply_async(sum_nrm_matrix, args=( fname, biases, ))) pool.close() print_progress(procs) pool.join() # to correct biases sumnrm = sum(p.get() for p in procs) target = (sumnrm / float(size * size * factor))**0.5 biases = dict([(b, biases[b] * target) for b in biases]) if not normalize_only: printime(' - Computing Cis percentage') # Calculate Cis percentage pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = path.join( outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out)) procs.append( pool.apply_async(get_cis_perc, args=(fname, biases, badcol, bins))) pool.close() print_progress(procs) pool.join() # collect results cis = total = 0 for proc in procs: c, t = proc.get() cis += c total += t norm_cisprc = float(cis) / total print ' * Cis-percentage: %.1f%%' % (norm_cisprc * 100) else: norm_cisprc = 0. 
printime(' - Rescaling decay') # normalize decay by size of the diagonal, and by Vanilla correction # (all cells must still be equals to 1 in average) pool = mu.Pool(ncpus) procs = [] for i, (region, start, end) in enumerate(zip(regs, begs, ends)): fname = path.join( outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out)) procs.append( pool.apply_async(sum_dec_matrix, args=(fname, biases, badcol, bins))) pool.close() print_progress(procs) pool.join() # collect results nrmdec = {} rawdec = {} for proc in procs: tmpnrm, tmpraw = proc.get() for c, d in tmpnrm.iteritems(): for k, v in d.iteritems(): try: nrmdec[c][k] += v rawdec[c][k] += tmpraw[c][k] except KeyError: try: nrmdec[c][k] = v rawdec[c][k] = tmpraw[c][k] except KeyError: nrmdec[c] = {k: v} rawdec[c] = {k: tmpraw[c][k]} # count the number of cells per diagonal # TODO: parallelize # find largest chromosome len_crms = dict( (c, section_pos[c][1] - section_pos[c][0]) for c in section_pos) # initialize dictionary ndiags = dict( (c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections) for crm in section_pos: beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1] chr_size = end_chr - beg_chr thesebads = [b for b in badcol if beg_chr <= b <= end_chr] for dist in xrange(1, chr_size): ndiags[crm][dist] += chr_size - dist # from this we remove bad columns # bad columns will only affect if they are at least as distant from # a border as the distance between the longest diagonal and the # current diagonal. bad_diag = set( ) # 2 bad rows can point to the same bad cell in diagonal maxp = end_chr - dist minp = beg_chr + dist for b in thesebads: if b < maxp: # not inclusive!! bad_diag.add(b) if b >= minp: bad_diag.add(b - dist) ndiags[crm][dist] -= len(bad_diag) # different behavior for longest diagonal: ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr for b in thesebads) # normalize sum per diagonal by total number of cells in diagonal signal_to_noise = 0.05 min_n = signal_to_noise**-2. # equals 400 when default for crm in sections: if not crm in nrmdec: nrmdec[crm] = {} rawdec[crm] = {} tmpdec = 0 # store count by diagonal tmpsum = 0 # store count by diagonal ndiag = 0 val = 0 previous = [ ] # store diagonals to be summed in case not reaching the minimum for k in ndiags[crm]: tmpdec += nrmdec[crm].get(k, 0.) tmpsum += rawdec[crm].get(k, 0.) previous.append(k) if tmpsum > min_n: ndiag = sum(ndiags[crm][k] for k in previous) val = tmpdec # backup of tmpdec kept for last ones outside the loop try: ratio = val / ndiag for k in previous: nrmdec[crm][k] = ratio except ZeroDivisionError: # all columns at this distance are "bad" pass previous = [] tmpdec = 0 tmpsum = 0 # last ones we average with previous result if len(previous) == len(ndiags[crm]): nrmdec[crm] = {} elif tmpsum < min_n: ndiag += sum(ndiags[crm][k] for k in previous) val += tmpdec try: ratio = val / ndiag for k in previous: nrmdec[crm][k] = ratio except ZeroDivisionError: # all columns at this distance are "bad" pass return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
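# The decay rescaling above pools consecutive diagonals until their raw
# count exceeds min_n = signal_to_noise**-2 (400 by default), then shares
# the averaged normalized value among the pooled diagonals. A simplified,
# single-chromosome sketch of that aggregation (inputs are plain dicts
# keyed by diagonal index):
def pool_decay(nrm_by_diag, raw_by_diag, ncells_by_diag,
               signal_to_noise=0.05):
    min_n = signal_to_noise ** -2.        # 400 with the default
    pooled, group, tmpnrm, tmpraw = {}, [], 0., 0.
    for k in sorted(ncells_by_diag):
        group.append(k)
        tmpnrm += nrm_by_diag.get(k, 0.)
        tmpraw += raw_by_diag.get(k, 0.)
        if tmpraw > min_n:
            ncells = sum(ncells_by_diag[g] for g in group) or 1
            for g in group:               # every pooled diagonal gets the mean
                pooled[g] = tmpnrm / ncells
            group, tmpnrm, tmpraw = [], 0., 0.
    return pooled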
def read_bam(inbam, filter_exclude, resolution, ncpus=8, region1=None, start1=None, end1=None, region2=None, start2=None, end2=None, nchunks=100, tmpdir='.', verbose=True, normalize=False, max_size=None): bamfile = AlignmentFile(inbam, 'rb') sections = OrderedDict( zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths])) # get chromosomes and genome sizes total = 0 section_pos = dict() for crm in sections: section_pos[crm] = (total, total + sections[crm]) total += sections[crm] # define genomic bins bins = [] for crm in sections: len_crm = sections[crm] bins.extend([(crm, i) for i in xrange(len_crm)]) if not bins: raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm)) # define start, end position of region to grab start_bin1 = 0 end_bin1 = len(bins) + 1 regions = bamfile.references if region1: regions = [region1] if region2: regions.append(region2) else: total = len(bins) if start1 is not None or end1: raise Exception('ERROR: Cannot use start/end1 without region') if start1 is not None: start_bin1 = section_pos[region1][0] + start1 / resolution else: if region1: start_bin1 = section_pos[region1][0] else: start_bin1 = 0 start1 = 0 if end1 is not None: end_bin1 = section_pos[region1][0] + end1 / resolution else: if region1: end_bin1 = section_pos[region1][1] end1 = sections[region1] * resolution else: end_bin1 = total end1 = total * resolution # define chunks, using at most 100 sub-divisions of region1 total = end_bin1 - start_bin1 regs = [] begs = [] ends = [] njobs = min(total, nchunks) + 1 nbins = total / njobs + 1 for i in xrange(start_bin1, end_bin1, nbins): if i + nbins > end_bin1: # make sure that we stop at the right place nbins = end_bin1 - i try: (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1] except IndexError: (crm1, beg1), (crm2, fin2) = bins[i], bins[-1] if crm1 != crm2: fin1 = sections[crm1] beg2 = 0 regs.append(crm1) regs.append(crm2) begs.append(beg1 * resolution) begs.append(beg2 * resolution) ends.append(fin1 * resolution + resolution) # last nt included ends.append(fin2 * resolution + resolution - 1) # last nt not included (overlap with next window) else: regs.append(crm1) begs.append(beg1 * resolution) ends.append(fin2 * resolution + resolution - 1) ends[-1] += 1 # last nucleotide included # reduce dictionaries all_bins = [] seenbins = set() for crm in regions: beg_crm = section_pos[crm][0] if region1: start = start_bin1 - beg_crm end = end_bin1 - beg_crm else: start = 0 end = section_pos[crm][1] - section_pos[crm][0] all_bins.extend([(crm, i) for i in xrange(start, end) if not (crm, i) in seenbins]) seenbins = set(all_bins) del (seenbins) bins_dict1 = dict((j, i) for i, j in enumerate(all_bins)) if region2: if not region2 in section_pos: raise Exception('ERROR: chromosome %s not found' % region2) bins = [] beg_crm = section_pos[region2][0] if start2 is not None: start_bin2 = section_pos[region2][0] + start2 / resolution else: start_bin2 = section_pos[region2][0] start2 = 0 if end2 is not None: end_bin2 = section_pos[region2][0] + end2 / resolution else: end_bin2 = section_pos[region2][1] end2 = sections[region2] * resolution start = start_bin2 - beg_crm end = end_bin2 - beg_crm bins = [(region2, i) for i in xrange(start, end)] bins_dict2 = dict([(j, i) for i, j in enumerate(bins)]) else: start_bin2 = start_bin1 end_bin2 = end_bin1 bins_dict2 = bins_dict1 size1 = end_bin1 - start_bin1 size2 = end_bin2 - start_bin2 if verbose: printime('\n (Matrix size %dx%d)' % (size1, size2)) if max_size and max_size < size1 * size2: raise 
Exception(('ERROR: matrix too large ({0}x{1}) should be at most ' '{2}x{2}').format(size1, size2, int(max_size**0.5))) pool = mu.Pool(ncpus) # create random hash associated to the run: rand_hash = "%016x" % getrandbits(64) ## RUN! if verbose: printime('\n - Parsing BAM (%d chunks)' % (len(regs))) mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash))) # empty all_bins array if we are not going to normalize if not normalize: all_bins = [] procs = [] for i, (region, b, e) in enumerate(zip(regs, begs, ends)): if ncpus == 1: _read_bam_frag( inbam, filter_exclude, all_bins, bins_dict1, bins_dict2, rand_hash, resolution, tmpdir, region, b, e, ) else: procs.append( pool.apply_async(_read_bam_frag, args=( inbam, filter_exclude, all_bins, bins_dict1, bins_dict2, rand_hash, resolution, tmpdir, region, b, e, ))) pool.close() if verbose: print_progress(procs) pool.join() bin_coords = start_bin1, end_bin1, start_bin2, end_bin2 chunks = regs, begs, ends return regions, rand_hash, bin_coords, chunks
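# Both read_bam variants above split the flat (chromosome, bin) list into
# contiguous chunks for parallel parsing, breaking any chunk that straddles
# a chromosome boundary into two regions. A simplified sketch of that
# chunking (end coordinates approximate; the originals adjust by +/- 1 nt
# to control window overlap):
def chunk_bins(bins, sections, resolution, nchunks=100):
    njobs = min(len(bins), nchunks) + 1
    size = len(bins) // njobs + 1
    regs, begs, ends = [], [], []
    for i in range(0, len(bins), size):
        crm1, beg1 = bins[i]
        crm2, end2 = bins[min(i + size - 1, len(bins) - 1)]
        if crm1 != crm2:                  # chunk crosses a chromosome boundary
            regs += [crm1, crm2]
            begs += [beg1 * resolution, 0]
            ends += [sections[crm1] * resolution + resolution,
                     end2 * resolution + resolution]
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution)
    return regs, begs, ends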
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) reso1 = reso2 = None if opts.bed1: mreads1 = path.realpath(opts.bed1) bad_co1 = opts.bad_co1 biases1 = opts.biases1 else: bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb( opts.workdir1, opts.jobid1, opts, opts.tmpdb1) mreads1 = path.join(opts.workdir1, mreads1) if opts.bed2: mreads2 = path.realpath(opts.bed2) bad_co2 = opts.bad_co2 biases2 = opts.biases2 else: bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb( opts.workdir2, opts.jobid2, opts, opts.tmpdb2) mreads2 = path.join(opts.workdir2, mreads2) if reso1 != reso2: raise Exception('ERROR: differing resolutions between experiments to ' 'be merged') print 'loading first sample', mreads1 hic_data1 = load_hic_data_from_reads(mreads1, opts.reso) print 'loading second sample', mreads2 hic_data2 = load_hic_data_from_reads(mreads2, opts.reso) if opts.norm and biases1: bad_co1 = path.join(opts.workdir1, bad_co1) print 'loading bad columns from first sample', bad_co1 hic_data1.bads = dict((int(l.strip()), True) for l in open(bad_co1)) biases1 = path.join(opts.workdir1, biases1) print 'loading biases from first sample', biases1 hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1])) for l in open(biases1)) elif opts.norm: raise Exception('ERROR: biases or filtered-columns not found') if opts.norm and biases2: bad_co2 = path.join(opts.workdir2, bad_co2) print 'loading bad columns from second sample', bad_co2 hic_data2.bads = dict((int(l.strip()), True) for l in open(bad_co2)) biases2 = path.join(opts.workdir2, biases2) print 'loading biases from second sample', biases2 hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1])) for l in open(biases2)) elif opts.norm: raise Exception('ERROR: biases or filtered-columns not found') mkdir(path.join(opts.workdir, '00_merge')) if not opts.skip_comparison: decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash)) eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash)) else: decay_corr_dat = 'None' decay_corr_fig = 'None' eigen_corr_dat = 'None' eigen_corr_fig = 'None' # if opts.norm: # has bias file if not opts.skip_comparison: print 'correlation between equidistant loci' corr, _, bads = correlate_matrices(hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, savefig=decay_corr_fig, savedata=decay_corr_dat, get_bads=True) print 'correlation between eigenvectors' eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, nvect=6, savefig=eigen_corr_fig, savedata=eigen_corr_dat) else: corr = eig_corr = None bads = {} # merge inputs mkdir(path.join(opts.workdir, '03_filtered_reads')) outbed = path.join(opts.workdir, '03_filtered_reads', 'valid_r1-r2_intersection_%s.tsv' % ( param_hash)) nreads = merge_2d_beds(mreads1, mreads2, outbed) finish_time = time.localtime() save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig, len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1, biases2, bad_co2, launch_time, finish_time)
def test_07_big_matrix(self): inbam = os_join(TEST_PATH, 'data', 'fake.bam') biases = os_join(TEST_PATH, 'data', 'biases3.pickle') outfile = os_join(TEST_PATH, 'lele', 'lala.tsv') tmppath = os_join(TEST_PATH, 'lele') mkdir(tmppath) nheader = write_big_matrix(inbam, RESOLUTION, biases, outfile, nchunks=100, wanted_chrom=None, wanted_pos1=None, wanted_pos2=None, dry_run=False, ncpus=8, tmpdir=tmppath, clean=True, verbose=False, square_size=100, waffle_radii=WINDOWS_SPAN, metric='loop') rand_hash = "%016x" % getrandbits(64) tmpdir = os_join(tmppath, '_tmp_%s' % (rand_hash)) mkdir(tmpdir) #sort all files for only read once per pair of peaks to extract sort_BAMtsv(nheader, outfile, tmpdir) system('rm -rf {}'.format(tmpdir)) fh = open(outfile) self.assertEqual(187515, sum(1 for l in fh)) fh.close() with open(outfile) as fh: for line in fh: if line.startswith('525\t723\t'): break b, e, r, p, c, vals = line.split() self.assertEqual(0.139, float(r)) self.assertEqual(0.216, float(p)) self.assertEqual(0.981, float(c)) self.assertEqual([ 0.903, 0.889, 1.401, 0.411, 0.814, 0.417, 0.856, 0.454, 0.8, 2.171, 0.892, 4.214, 0.433, 0, 0.402, 1.288, 0.455, 0.869, 0.852, 0.42, 0.919, 0, 0.842, 1.579, 0.405, 1.788, 1.706, 1.164, 1.265, 1.328, 1.281, 1.267, 1.249, 0, 0.431, 0.428, 1.116, 1.832, 1.698, 1.179, 0.405, 1.996, 1.639, 0.828, 0, 0.749, 0.365, 0.383, 0.391, 1.161, 0.795, 1.224, 0.866, 0.786, 1.932, 1.142, 1.186, 0.732, 0.798, 0.393, 0.421, 1.786, 0.852, 1.366, 0.39, 0.819, 2.621, 0.741, 1.611, 0.413, 1.371, 0.436, 1.051, 0.345, 0, 1.165, 1.14, 0.749, 1.272, 0.45, 1.789 ], [float(v) for v in vals.split(',')]) with open(outfile) as fh: for line in fh: if line.startswith('854\t988\t'): break b, e, r, p, c, vals = line.split() self.assertEqual(0.224, float(r)) self.assertEqual(0.0448, float(p)) self.assertEqual(1.394, float(c)) self.assertEqual([ 2.123, 1.106, 0.585, 0.572, 1.636, 1.681, 0.517, 0.534, 0.556, 1.521, 1.057, 1.059, 1.093, 0, 0, 1.04, 1.02, 1.062, 1.003, 2.09, 2.093, 2.047, 1.03, 1.058, 1.028, 2.123, 0, 2.441, 1.03, 2.062, 1.008, 1.441, 0.521, 1.013, 0, 1.088, 3.036, 1.055, 1.069, 1.045, 0.996, 0.512, 3.148, 2.167, 2.256, 0.504, 2.103, 0, 0, 0.993, 1.02, 1.486, 1.621, 0.562, 0.981, 2.044, 1.024, 0.5, 1.447, 1.983, 0.963, 1.988, 1.639, 1.007, 1.59, 0, 0.519, 2.473, 2.057, 1.498, 0.516, 0.537, 0.508, 1.059, 0, 0.524, 0.499, 0.513, 0.504, 0.521, 0 ], [float(v) for v in vals.split(',')]) system('rm -rf {}'.format(tmppath))
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter
    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different from the"
                            " ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes
        for c in refs:
            if not c in mappability:
                # NOTE: the original used len(refs) here, which is the
                # number of chromosomes; the chromosome length in bins is
                # what is needed
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenate
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso,
        #                   pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')
    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
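# The pickle saved above bundles everything needed to normalize raw counts
# downstream: per-bin biases, per-chromosome decay, and bad columns. A
# sketch of how it can be consumed (file name and bin indices are
# hypothetical):
from cPickle import load  # use pickle under Python 3

with open('04_normalization/biases_10kb_abc123.pickle', 'rb') as in_f:
    dat = load(in_f)
biases, decay, badcol = dat['biases'], dat['decay'], dat['badcol']

def normalized_count(raw, i, j, crm):
    if i in badcol or j in badcol:
        return float('nan')              # masked column
    nrm = raw / (biases[i] * biases[j])  # visibility correction
    return nrm / decay[crm][abs(i - j)]  # distance (O/E) correction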
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) reso1 = reso2 = None if opts.bed1: mreads1 = path.realpath(opts.bed1) bad_co1 = opts.bad_co1 biases1 = opts.biases1 else: bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb( opts.workdir1, opts.jobid1, opts, opts.tmpdb1) mreads1 = path.join(opts.workdir1, mreads1) if opts.bed2: mreads2 = path.realpath(opts.bed2) bad_co2 = opts.bad_co2 biases2 = opts.biases2 else: bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb( opts.workdir2, opts.jobid2, opts, opts.tmpdb2) mreads2 = path.join(opts.workdir2, mreads2) if reso1 != reso2: raise Exception('ERROR: differing resolutions between experiments to ' 'be merged') mkdir(path.join(opts.workdir, '00_merge')) if not opts.skip_comparison: print 'Comparison' print ' - loading first sample', mreads1 hic_data1 = load_hic_data_from_reads(mreads1, opts.reso) print ' - loading second sample', mreads2 hic_data2 = load_hic_data_from_reads(mreads2, opts.reso) if opts.norm and biases1: bad_co1 = path.join(opts.workdir1, bad_co1) print ' - loading bad columns from first sample', bad_co1 hic_data1.bads = dict( (int(l.strip()), True) for l in open(bad_co1)) biases1 = path.join(opts.workdir1, biases1) print ' - loading biases from first sample', biases1 hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1])) for l in open(biases1)) elif opts.norm: raise Exception('ERROR: biases or filtered-columns not found') if opts.norm and biases2: bad_co2 = path.join(opts.workdir2, bad_co2) print ' - loading bad columns from second sample', bad_co2 hic_data2.bads = dict( (int(l.strip()), True) for l in open(bad_co2)) biases2 = path.join(opts.workdir2, biases2) print ' - loading biases from second sample', biases2 hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1])) for l in open(biases2)) elif opts.norm: raise Exception('ERROR: biases or filtered-columns not found') decay_corr_dat = path.join( opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) decay_corr_fig = path.join( opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash)) eigen_corr_dat = path.join( opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) eigen_corr_fig = path.join( opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash)) else: hic_data1 = {} hic_data2 = {} decay_corr_dat = 'None' decay_corr_fig = 'None' eigen_corr_dat = 'None' eigen_corr_fig = 'None' # if opts.norm: # has bias file if not opts.skip_comparison: print ' => correlation between equidistant loci' corr, _, bads = correlate_matrices(hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, savefig=decay_corr_fig, savedata=decay_corr_dat, get_bads=True) print ' => correlation between eigenvectors' eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, nvect=6, savefig=eigen_corr_fig, savedata=eigen_corr_dat) else: corr = eig_corr = 0 bads = {} # merge inputs mkdir(path.join(opts.workdir, '03_filtered_reads')) outbed = path.join(opts.workdir, '03_filtered_reads', 'valid_r1-r2_intersection_%s.tsv' % (param_hash)) print '\nMergeing...' nreads = merge_2d_beds(mreads1, mreads2, outbed) finish_time = time.localtime() save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig, len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1, biases2, bad_co2, launch_time, finish_time) print '\n\nDone.'
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bam: mreads = path.realpath(opts.bam) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) filter_exclude = opts.filter outdir = path.join(opts.workdir, '04_normalization') mkdir(outdir) mappability = gc_content = n_rsites = None if opts.normalization == 'oneD': if not opts.fasta: raise Exception('ERROR: missing path to FASTA for oneD normalization') if not opts.renz: raise Exception('ERROR: missing restriction enzyme name for oneD normalization') if not opts.mappability: raise Exception('ERROR: missing path to mappability for oneD normalization') bamfile = AlignmentFile(mreads, 'rb') refs = bamfile.references bamfile.close() # get genome sequence ~1 min printime(' - parsing FASTA') genome = parse_fasta(opts.fasta, verbose=False) fas = set(genome.keys()) bam = set(refs) if fas - bam: print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))) if len(fas - bam) <= 50: print('\n'.join([(' - ' + c) for c in (fas - bam)])) if bam - fas: txt = ('\n'.join([(' - ' + c) for c in (bam - fas)]) if len(bam - fas) <= 50 else '') raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % ( len(bam - fas), txt)) refs = [crm for crm in refs if crm in genome] if len(refs) == 0: raise Exception("ERROR: chromosomes in FASTA different the ones" " in BAM") # get mappability ~2 min printime(' - Parsing mappability') mappability = parse_mappability_bedGraph( opts.mappability, opts.reso, wanted_chrom=refs[0] if len(refs)==1 else None) # resize chomosomes for c in refs: if not c in mappability: mappability[c] = [float('nan')] * (len(refs) // opts.reso + 1) if len(mappability[c]) < len(refs) // opts.reso + 1: mappability[c] += [float('nan')] * ( (len(refs) // opts.reso + 1) - len(mappability[c])) # concatenates mappability = reduce(lambda x, y: x + y, (mappability.get(c, []) for c in refs)) printime(' - Computing GC content per bin (removing Ns)') gc_content = get_gc_content(genome, opts.reso, chromosomes=refs, n_cpus=opts.cpus) # pad mappability at the end if the size is close to gc_content if len(mappability)<len(gc_content) and len(mappability)/len(gc_content) > 0.95: mappability += [float('nan')] * (len(gc_content)-len(mappability)) # compute r_sites ~30 sec # TODO: read from DB printime(' - Computing number of RE sites per bin (+/- 200 bp)') n_rsites = [] re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '') for crm in refs: for pos in range(200, len(genome[crm]) + 200, opts.reso): seq = genome[crm][pos-200:pos + opts.reso + 200] n_rsites.append(seq.count(re_site)) ## CHECK TO BE REMOVED # out = open('tmp_mappability.txt', 'w') # i = 0 # for crm in refs: # for pos in xrange(len(genome[crm]) / opts.reso + 1): # out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i])) # i += 1` # out.close() # compute GC content ~30 sec # TODO: read from DB biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam( mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2, factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus, normalization=opts.normalization, mappability=mappability, p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites, seed=opts.seed, normalize_only=opts.normalize_only, max_njobs=opts.max_njobs, extra_bads=opts.badcols, biases_path=opts.biases_path, cis_limit=opts.cis_limit, trans_limit=opts.trans_limit, min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter) 
inter_vs_gcoord = path.join(opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.png_%s_%s.png' % ( opts.reso, param_hash)) # get and plot decay if not opts.normalize_only: printime(' - Computing interaction decay vs genomic distance') (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only, savefig=inter_vs_gcoord) print (' -> Decay slope 0.7-10 Mb\t%s' % a2) else: a2 = 0. printime(' - Saving biases and badcol columns') # biases bias_file = path.join(outdir, 'biases_%s_%s.pickle' % ( nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'wb') dump({'biases' : biases, 'decay' : decay, 'badcol' : badcol, 'resolution': opts.reso}, out, HIGHEST_PROTOCOL) out.close() finish_time = time.localtime() try: save_to_db(opts, bias_file, mreads, len(badcol), len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter, launch_time, finish_time) except: # release lock anyway print_exc() try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass exit(1)
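# parse_mappability_bedGraph used above aggregates bedGraph intervals into
# fixed-size genomic bins: each bin gets the coverage-weighted mean of the
# intervals overlapping it. The same computation as a standalone sketch,
# for a single chromosome (intervals given as (beg, end, value) tuples,
# all within chrom_len):
def bin_bedgraph(intervals, chrom_len, reso):
    binned = [0.] * (chrom_len // reso + 1)
    for beg, end, val in intervals:
        while beg < end:
            b = beg // reso                         # bin holding this position
            chunk = min(end, (b + 1) * reso) - beg  # overlap with that bin
            binned[b] += chunk * val
            beg += chunk
    return [v / reso for v in binned]

assert bin_bedgraph([(0, 150, 1.0), (150, 200, 0.5)], 199, 100) == [1.0, 0.75]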
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts, extra=['quiet']) if opts.zrange: vmin = float(opts.zrange.split(',')[0]) vmax = float(opts.zrange.split(',')[1]) else: vmin = vmax = None if opts.figsize: opts.figsize = map(float, opts.figsize.split(',')) else: vmin = vmax = None clean = True # change for debug if opts.bam: mreads = path.realpath(opts.bam) if not opts.biases and all(v !='raw' for v in opts.normalizations): raise Exception('ERROR: external BAM input, should provide path to' ' biases file.') biases = opts.biases else: biases, mreads = load_parameters_fromdb(opts) mreads = path.join(opts.workdir, mreads) biases = path.join(opts.workdir, biases) if biases else None if opts.biases: biases = opts.biases coord1 = opts.coord1 coord2 = opts.coord2 if coord2 and not coord1: coord1, coord2 = coord2, coord1 if not coord1: region1 = None start1 = None end1 = None region2 = None start2 = None end2 = None else: try: crm1, pos1 = coord1.split(':') start1, end1 = pos1.split('-') region1 = crm1 start1 = int(start1) end1 = int(end1) except ValueError: region1 = coord1 start1 = None end1 = None if coord2: try: crm2, pos2 = coord2.split(':') start2, end2 = pos2.split('-') region2 = crm2 start2 = int(start2) end2 = int(end2) except ValueError: region2 = coord2 start2 = None end2 = None else: region2 = None start2 = None end2 = None if opts.plot and not opts.force_plot: if opts.interactive: max_size = 1500**2 else: max_size = 5000**2 else: max_size = None outdir = path.join(opts.workdir, '05_sub-matrices') mkdir(outdir) tmpdir = path.join(opts.workdir, '05_sub-matrices', '_tmp_sub-matrices_%s' % param_hash) mkdir(tmpdir) if region1: if region1: if not opts.quiet: stdout.write('\nExtraction of %s' % (region1)) if start1: if not opts.quiet: stdout.write(':%s-%s' % (start1, end1)) else: if not opts.quiet: stdout.write(' (full chromosome)') if region2: if not opts.quiet: stdout.write(' intersection with %s' % (region2)) if start2: if not opts.quiet: stdout.write(':%s-%s\n' % (start2, end2)) else: if not opts.quiet: stdout.write(' (full chromosome)\n') else: if not opts.quiet: stdout.write('\n') else: if not opts.quiet: stdout.write('\nExtraction of full genome\n') out_files = {} out_plots = {} if opts.matrix or opts.plot: bamfile = AlignmentFile(mreads, 'rb') sections = OrderedDict(zip(bamfile.references, [x for x in bamfile.lengths])) total = 0 section_pos = OrderedDict() for crm in sections: section_pos[crm] = (total, total + sections[crm]) total += sections[crm] for norm in opts.normalizations: norm_string = ('RAW' if norm == 'raw' else 'NRM' if norm == 'norm' else 'DEC') printime('Getting %s matrices' % norm) try: matrix, bads1, bads2, regions, name, bin_coords = get_matrix( mreads, opts.reso, load(open(biases)) if biases and norm != 'raw' else None, normalization=norm, region1=region1, start1=start1, end1=end1, region2=region2, start2=start2, end2=end2, tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True, nchunks=opts.nchunks, verbose=not opts.quiet, clean=clean, max_size=max_size) except NotImplementedError: if norm == "raw&decay": warn('WARNING: raw&decay normalization not implemented ' 'for matrices\n... 
skipping\n') continue raise b1, e1, b2, e2 = bin_coords b1, e1 = 0, e1 - b1 b2, e2 = 0, e2 - b2 if opts.row_names: starts = [start1, start2] ends = [end1, end2] row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions) for p in range(starts[r] if r < len(starts) and starts[r] else 0, ends[r] if r < len(ends) and ends[r] else sections[reg], opts.reso)) if opts.matrix: printime(' - Writing: %s' % norm) fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(opts.reso, sep=''), ('_' + param_hash)) out_files[norm_string] = path.join(outdir, fnam) out = open(path.join(outdir, fnam), 'w') for reg in regions: out.write('# CRM %s\t%d\n' % (reg, sections[reg])) if region2: out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1]))) out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2]))) else: out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1]))) if opts.row_names: out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) + '\t'.join(str(matrix.get((i, j), 0)) for i in xrange(b1, e1)) for j in xrange(b2, e2)) + '\n') else: out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0)) for i in xrange(b1, e1)) for j in xrange(b2, e2)) + '\n') out.close() if opts.plot: # transform matrix matrix = array([array([matrix.get((i, j), 0) for i in xrange(b1, e1)]) for j in xrange(b2, e2)]) m = zeros_like(matrix) for bad1 in bads1: m[:,bad1] = 1 for bad2 in bads2: m[bad2,:] = 1 matrix = ma.masked_array(matrix, m) printime(' - Plotting: %s' % norm) fnam = '%s_%s_%s%s%s.%s' % ( norm, name, nicer(opts.reso, sep=''), ('_' + param_hash), '_tri' if opts.triangular else '', opts.format) out_plots[norm_string] = path.join(outdir, fnam) pltbeg1 = 0 if start1 is None else start1 pltend1 = sections[regions[0]] if end1 is None else end1 pltbeg2 = 0 if start2 is None else start2 pltend2 = sections[regions[-1]] if end2 is None else end2 xlabel = '{}:{:,}-{:,}'.format( regions[0], pltbeg1 if pltbeg1 else 1, pltend1) ylabel = '{}:{:,}-{:,}'.format( regions[-1], pltbeg2 if pltbeg2 else 1, pltend2) section_pos = OrderedDict((k, section_pos[k]) for k in section_pos if k in regions) ax1, _ = plot_HiC_matrix( matrix, triangular=opts.triangular, vmin=vmin, vmax=vmax, cmap=opts.cmap, figsize=opts.figsize, bad_color=opts.bad_color if norm != 'raw' else None) ax1.set_title('Region: %s, normalization: %s, resolution: %s' % ( name, norm, nicer(opts.reso)), y=1.05) _format_axes(ax1, start1, end1, start2, end2, opts.reso, regions, section_pos, sections, opts.xtick_rotation, triangular=False) if opts.interactive: plt.show() plt.close('all') else: tadbit_savefig(path.join(outdir, fnam)) if not opts.matrix and not opts.only_plot: printime('Getting and writing matrices') out_files.update(write_matrix( mreads, opts.reso, load(open(biases)) if biases else None, outdir, filter_exclude=opts.filter, normalizations=opts.normalizations, region1=region1, start1=start1, end1=end1, region2=region2, start2=start2, end2=end2, tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus, nchunks=opts.nchunks, verbose=not opts.quiet, extra=param_hash, clean=clean)) if clean: printime('Cleaning') system('rm -rf %s '% tmpdir) if not opts.interactive: printime('Saving to DB') finish_time = time.localtime() save_to_db(opts, launch_time, finish_time, out_files, out_plots)
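# The matrices written above start with '# CRM <name> <length>' header
# lines plus a '# MASKED' (or '# BADROWS'/'# BADCOLS') line, followed by
# tab-separated values. A minimal reader sketch (assumes --row_names was
# not used, so data rows contain only values):
def read_mat(fname):
    sections, masked, rows = {}, set(), []
    for line in open(fname):
        if line.startswith('# CRM'):
            _, _, crm, length = line.split()
            sections[crm] = int(length)
        elif line.startswith('# MASKED'):
            fields = line.split()
            if len(fields) > 2:
                masked = set(int(b) for b in fields[2].split(','))
        elif not line.startswith('#'):
            rows.append([float(v) for v in line.split('\t')])
    return sections, masked, rows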
def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem', r_enz=None, frag_map=True, min_seq_len=15, windows=None, add_site=True, clean=False, get_nread=False, mapper_binary=None, mapper_params=None, **kwargs): """ Maps FASTQ reads to an indexed reference genome. Mapping can be done either without knowledge of the restriction enzyme used, or for experiments performed without one, like Micro-C (iterative mapping), or using the ligation sites created from the digested ends (fragment-based mapping). :param mapper_index_path: path to index file created from a reference genome using gem-index tool or bowtie2-build :param fastq_path: PATH to FASTQ file, either compressed or not. :param out_map_dir: path to a directory where to store mapped reads in MAP format . :param None r_enz: name of the restriction enzyme used in the experiment e.g. HindIII. This is optional if frag_map option is False :param True frag_map: two step mapper, first full length is mapped, then remaining, unmapped reads, are divided into restriction-enzyme fragments andeach is mapped. :param True add_site: when splitting the sequence by ligated sites found, removes the ligation site, and put back the original RE site. :param 15 min_seq_len: minimum size of a fragment to map :param None windows: tuple of ranges for beginning and end of the mapping. This parameter allows to do classical iterative mapping, e.g. windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50)) A unique window can also be passed, for trimming, like this: windows=((1,101),) :param False clean: remove intermediate files created in temp_dir :param 4 nthreads: number of threads to use for mapping (number of CPUs) :param 0.04 max_edit_distance: The maximum number of edit operations allowed while verifying candidate matches by dynamic programming. :param 0.04 mismatches: The maximum number of nucleotide substitutions allowed while mapping each k-mer. It is always guaranteed that, however other options are chosen, all the matches up to the specified number of substitutions will be found by the program. :param /tmp temp_dir: important to change. Intermediate FASTQ files will be written there. :param False get_nreads: returns a list of lists where each element contains a path and the number of reads processed :param gem-mapper mapper_binary: path to the binary mapper :param None mapper_params: extra parameters for the mapper :returns: a list of paths to generated outfiles. 
To be passed to :func:`pytadbit.parsers.map_parser.parse_map` """ skip = kwargs.get('skip', False) suffix = kwargs.get('suffix', '') suffix = ('_' * (suffix != '')) + suffix nthreads = kwargs.get('nthreads', 8) outfiles = [] temp_dir = os.path.abspath(os.path.expanduser( kwargs.get('temp_dir', gettempdir()))) if mapper_params: kwargs.update(mapper_params) # create directories for rep in [temp_dir, out_map_dir]: mkdir(rep) # check space fspace = int(get_free_space_mb(temp_dir, div=3)) if fspace < 200: warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir)) # iterative mapping base_name = os.path.split(fastq_path)[-1].replace('.gz', '') base_name = '.'.join(base_name.split('.')[:-1]) input_reads = fastq_path if windows is None: light_storage = True windows = (None, ) elif isinstance(windows[0], int): # if windows starts at zero we do not need to store all the sequence # otherwise we need it because sequence can be trimmed two times # in fragment based mapping light_storage = True if not windows[0] else False windows = [tuple(windows)] else: # ensure that each element is a tuple, not a list windows = [tuple(win) for win in windows] # in this case we will need to keep the information about original # sequence at any point, light storage is thus not possible. light_storage = False for win in windows: # Prepare the FASTQ file and iterate over them curr_map, counter = transform_fastq( input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1], fastq=is_fastq(input_reads), min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads, light_storage=light_storage) # clean if input_reads != fastq_path and clean: print ' x removing original input %s' % input_reads os.system('rm -f %s' % (input_reads)) # First mapping, full length if not win: beg, end = 1, 'end' else: beg, end = win out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix) if end: print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix) else: print 'Mapping full reads...', curr_map if not skip: if mapper == 'gem': _gem_mapping(mapper_index_path, curr_map, out_map_path, gem_binary=(mapper_binary if mapper_binary else 'gem-mapper'), **kwargs) # parse map file to extract not uniquely mapped reads print 'Parsing result...' _gem_filter(out_map_path, curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix), os.path.join(out_map_dir, base_name + '_full_%s-%s%s.map' % ( beg, end, suffix))) elif mapper == 'bowtie2': _bowtie2_mapping(mapper_index_path, curr_map, out_map_path, bowtie2_binary=(mapper_binary if mapper_binary else 'bowtie2'), bowtie2_params=mapper_params, **kwargs) # parse map file to extract not uniquely mapped reads print 'Parsing result...' 
_bowtie2_filter(out_map_path, curr_map, curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix), os.path.join(out_map_dir, base_name + '_full_%s-%s%s.map' % (beg, end, suffix))) else: raise Exception('ERROR: unknown mapper.') # clean if clean: print ' x removing %s input %s' % (mapper.upper(),curr_map) os.system('rm -f %s' % (curr_map)) print ' x removing map %s' % out_map_path os.system('rm -f %s' % (out_map_path)) # for next round, we will use remaining unmapped reads input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix) outfiles.append( (os.path.join(out_map_dir, base_name + '_full_%s-%s%s.map' % (beg, end, suffix)), counter)) # map again splitting unmapped reads into RE fragments # (no need to trim this time) if frag_map: if not r_enz: raise Exception('ERROR: need enzyme name to fragment.') frag_map, counter = transform_fastq( input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1], min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz, add_site=add_site, skip=skip, nthreads=nthreads, light_storage=light_storage) # clean if clean: print ' x removing pre-%s input %s' % (mapper.upper(),input_reads) os.system('rm -f %s' % (input_reads)) if not win: beg, end = 1, 'end' else: beg, end = win out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix) if not skip: if mapper == 'gem': print 'Mapping fragments of remaining reads...' _gem_mapping(mapper_index_path, frag_map, out_map_path, gem_binary=(mapper_binary if mapper_binary else 'gem-mapper'), **kwargs) print 'Parsing result...' _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix), os.path.join(out_map_dir, base_name + '_frag_%s-%s%s.map' % (beg, end, suffix))) elif mapper == 'bowtie2': print 'Mapping fragments of remaining reads...' _bowtie2_mapping(mapper_index_path, frag_map, out_map_path, bowtie2_binary=(mapper_binary if mapper_binary else 'bowtie2'), bowtie2_params=mapper_params, **kwargs) print 'Parsing result...' _bowtie2_filter(out_map_path, frag_map, curr_map + '_fail%s.map' % (suffix), os.path.join(out_map_dir, base_name + '_frag_%s-%s%s.map' % (beg, end, suffix))) else: raise Exception('ERROR: unknown mapper.') # clean if clean: print ' x removing %s input %s' % (mapper.upper(),frag_map) os.system('rm -f %s' % (frag_map)) print ' x removing failed to map ' + curr_map + '_fail%s.map' % (suffix) os.system('rm -f %s' % (curr_map + '_fail%s.map' % (suffix))) print ' x removing tmp mapped %s' % out_map_path os.system('rm -f %s' % (out_map_path)) outfiles.append((os.path.join(out_map_dir, base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)), counter)) if get_nread: return outfiles return [out for out, _ in outfiles]
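# Hypothetical usage of full_mapping (all paths are placeholders; the
# import path follows TADbit's mapper module layout): fragment-based
# mapping with GEM, or classical iterative mapping with growing windows
# (in which case no enzyme is needed).
from pytadbit.mapping.full_mapper import full_mapping

# fragment-based (default): map full reads, then split the unmapped ones
# at ligation sites and map the pieces
maps = full_mapping('/data/index/hg38.gem', '/data/reads_1.fastq.gz',
                    '/results/01_mapped_r1', r_enz='HindIII',
                    temp_dir='/tmp/r1')

# iterative mapping instead:
maps = full_mapping('/data/index/hg38.gem', '/data/reads_1.fastq.gz',
                    '/results/01_mapped_r1', frag_map=False,
                    windows=((1, 25), (1, 50), (1, 75)))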
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            biases2 = None  # was "biases1 = None": this branch handles sample 2

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime(' - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime(' - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime(' - comparing experiments')
        printime('   => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('     - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('   => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime('   => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print('     - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    if not opts.skip_merge:
        outbam = path.join(opts.workdir, '03_filtered_reads',
                           'intersection_%s.bam' % (param_hash))
        printime(' - Merging experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                     mreads1, mreads2))
        printime(' - Indexing new BAM file')
        # check samtools version number and modify the command line accordingly
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True
                                               ).communicate()[1].split('\n')
                                if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()

    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(list(bads.keys())), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
def check_options(opts):
    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    opts.gem_binary = which(opts.gem_binary)
    if not opts.gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if '
                        'you have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # check restriction enzyme name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print('\n\nERROR: restriction enzyme not found. Use one of:\n\n'
              + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()
    except AttributeError:
        pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: working directory not found, no output files to '
              'reuse, not skipping...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)
    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    # create tmp directory
    if not opts.tmp:
        opts.tmp = opts.workdir + '_tmp_r%d' % opts.read

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)

    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq, opts.read)
    # else:
    #     log_format = '[DEFAULT]   %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print 'Writing log to ' + path.join(opts.workdir, 'process.log')
        logging.basicConfig(level=logging.INFO, format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG, format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check GEM mapper extra options
    if opts.gem_param:
        opts.gem_param = dict([o.split(':') for o in opts.gem_param])
    else:
        opts.gem_param = {}
    gem_valid_option = set(["granularity", "q", "quality-format",
                            "gem-quality-threshold", "mismatch-alphabet",
                            "m", "e", "min-matched-bases",
                            "max-big-indel-length", "s", "strata-after-best",
                            "fast-mapping", "unique-mapping", "d", "D",
                            "allow-incomplete-strata", "max-decoded-matches",
                            "min-decoded-strata", "p", "paired-end-alignment",
                            "b", "map-both-ends", "min-insert-size",
                            "max-insert-size", "E", "max-extendable-matches",
                            "max-extensions-per-match", "unique-pairing"])
    for k in opts.gem_param:
        if not k in gem_valid_option:
            raise NotImplementedError(('ERROR: option "%s" not a valid GEM '
                                       'option or not supported by this tool.') % k)

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
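# NOTE: extra GEM options are given on the command line as 'key:value'
# strings and validated against the whitelist above. A minimal sketch of that
# parse-and-validate step; `_example_parse_gem_param` is a hypothetical
# helper, not called by the pipeline:
def _example_parse_gem_param(params, valid_options):
    """['q:33', 'e:0.04'] -> {'q': '33', 'e': '0.04'}; raise on unknown keys."""
    parsed = dict(o.split(':') for o in params)
    for key in parsed:
        if key not in valid_options:
            raise NotImplementedError(
                'ERROR: option "%s" not a valid GEM option' % key)
    return parsed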
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter
    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD '
                            'normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD '
                            'normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD '
                            'normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove '
                            'them):\n%s\n' % (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception('ERROR: chromosomes in FASTA differ from the '
                            'ones in BAM')

        # get mappability ~2 min
        printime('  - Parsing mappability')
        fh = open(opts.mappability)
        mappability = dict((c, []) for c in refs)
        line = fh.next()
        crmM, begM, endM, val = line.split()
        crm = crmM
        if crmM not in mappability:
            print('     skipping %s' % crmM)
            while crmM not in mappability:
                line = fh.next()
                crmM, begM, endM, val = line.split()
                crm = crmM
        while any(not mappability[c] for c in mappability):
            for begB in xrange(0, len(genome[crmM]), opts.reso):
                endB = begB + opts.reso
                tmp = 0
                try:
                    while True:
                        crmM, begM, endM, val = line.split()
                        if crm != crmM:
                            # reached the next chromosome: skip entries
                            # absent from the BAM references
                            try:
                                while crmM not in refs:
                                    line = fh.next()
                                    crmM, _ = line.split('\t', 1)
                            except StopIteration:
                                pass
                            break
                        begM = int(begM)
                        endM = int(endM)
                        if endM > endB:
                            # interval overflows the current bin: count only
                            # the overlapping part and move to the next bin
                            weight = endB - begM
                            if weight >= 0:
                                tmp += weight * float(val)
                            break
                        weight = endM - (begM if begM > begB else begB)
                        if weight < 0:
                            break
                        tmp += weight * float(val)
                        line = fh.next()
                except StopIteration:
                    pass
                mappability[crm].append(tmp / opts.reso)
            crm = crmM
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count,
        sigma=2, factor=1, outdir=outdir, extra_out=param_hash,
        ncpus=opts.cpus, normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc,
        max_perc=opts.max_perc, normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs, extra_bads=opts.badcols)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')
    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except Exception:
        # release the database lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
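# NOTE: the biases pickle written above is a plain dict with the keys
# 'biases', 'decay', 'badcol' and 'resolution'. Downstream steps can reload
# it roughly as sketched below; `_example_load_biases` is a hypothetical
# helper, not called by the pipeline:
def _example_load_biases(bias_file):
    """Reload a biases pickle written by run() above."""
    from pickle import load
    fh = open(bias_file)
    data = load(fh)
    fh.close()
    return (data['biases'], data['decay'], data['badcol'],
            data['resolution'])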
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    ncpus = opts.cpus
    if opts.biases:
        biases = load(open(opts.biases))
    else:
        biases = {}
    outdir = opts.outdir
    coord1 = opts.coord1
    coord2 = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception('ERROR: different resolution in bias file (you want '
                        '%d, there is %d).\n' % (resolution,
                                                 biases['resolution']))

    # if only the second coordinate was given, treat it as the first
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    mkdir(outdir)

    if region1:
        sys.stdout.write('\nExtraction of %s' % (region1))
        if start1:
            sys.stdout.write(':%s-%s' % (start1, end1))
        else:
            sys.stdout.write(' (full chromosome)')
        if region2:
            sys.stdout.write(' intersection with %s' % (region2))
            if start2:
                sys.stdout.write(':%s-%s\n' % (start2, end2))
            else:
                sys.stdout.write(' (full chromosome)\n')
        else:
            sys.stdout.write('\n')
    else:
        sys.stdout.write('\nExtraction of full genome\n')

    read_bam(inbam, filter_exclude, resolution, biases,
             region1=region1, start1=start1, end1=end1,
             region2=region2, start2=start2, end2=end2,
             ncpus=ncpus, outdir=outdir)

    printime('\nDone.')
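# NOTE: coordinates above follow the UCSC-like 'chrN:start-end' convention,
# falling back to a bare chromosome name when no range is given. A minimal
# sketch of that parsing rule; `_example_parse_coord` is a hypothetical
# helper, not called by main():
def _example_parse_coord(coord):
    """'chr3:100000-200000' -> ('chr3', 100000, 200000);
    'chr3' -> ('chr3', None, None)."""
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        # no ':' or no '-': interpret the string as a full chromosome
        return coord, None, None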
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros, draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=(path.join(opts.workdir, '04_normalization',
                               'bad_columns_%s_%d_%s.pdf' % (
                                   opts.reso, opts.perc_zeros, param_hash))
                     if not opts.fast_filter else None))
    except ValueError:
        # filtering failed with the requested threshold: fall back to
        # removing only fully empty columns
        hic_data.filter_columns(
            perc_zero=100, draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=(path.join(opts.workdir, '04_normalization',
                               'bad_columns_%s_%d_%s.pdf' % (
                                   opts.reso, opts.perc_zeros, param_hash))
                     if not opts.fast_filter else None))

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,  diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,  diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.pdf' % (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                              for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra-chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter-chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving raw and normalized genomic matrices..."
        # as for the other matrices, images are skipped in text-only mode
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads,
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
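# NOTE: conceptually, the cis/trans ratio reported by run() above is the
# fraction of interaction counts that fall within a chromosome over the
# total. A toy illustration on a plain dict of counts; this is a hypothetical
# standalone sketch, unrelated to the HiC_data implementation used above:
def _example_cis_trans_ratio(counts, chrom_of):
    """counts: {(bin_i, bin_j): n}; chrom_of: {bin: chromosome name}."""
    cis = sum(n for (i, j), n in counts.items()
              if chrom_of[i] == chrom_of[j])
    total = float(sum(counts.values()))
    return cis / total if total else 0.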
def get_intersection(fname1, fname2, out_path, verbose=False, compress=False):
    """
    Merges the two files corresponding to the two read ends. Reads found in
    both files are paired and written to an output file.

    Dealing with multiple contacts:
     - a pairwise contact is created for each possible combination of the
       multi-contacts. The name of the read is extended by '# 1/3' in case
       the reported pairwise contact corresponds to the first of 3 possible
       combinations
     - it may happen that different contacts are mapped on a single RE
       fragment (when each maps to a different end of the fragment), in
       which case:
        - if no other fragment from this read is mapped, both are kept
        - otherwise, they are merged into one longer fragment (as if they
          were mapped in the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format
       similar to the inputs
    :param False compress: compress (gzip) input files. This is done in the
       background while the next input files are parsed.

    :returns: final number of pairs of interacting fragments, and a
       dictionary with the number of multiple contacts (the keys of the
       dictionary being the number of fragments caught together: 3, 4, 5...)
    """
    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = next(reads1)
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = next(reads1)
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = next(reads2)
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = next(reads2)
    read2 = line2.split('\t', 1)[0]

    if header1 != header2:
        raise Exception('ERROR: the two files seem to be mapped over '
                        'different chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos // nchunks
    buf = dict([(i, []) for i in range(nchunks + 1)])

    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in range(nchunks // int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files, store them into a
    # dictionary and then into temporary files (the dictionary is emptied
    # every 1 million entries)
    if verbose:
        print('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        (' %4d million reads' % (count_dots))
                        if count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                stdout.flush()
            count_dots += 1
            for _ in range(1000000):  # iterate 1 million times, write to files
                # same read id in both lines: store the pair, putting the
                # more upstream position first
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                # if the read id of line1 is greater than the one of line2,
                # advance in file 2; otherwise advance in file 1
                elif gt_reads(read1, read2):
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
    write_to_files(buf, tmp_dir, nchunks)

    if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)

    # compression
    if compress:
        if verbose:
            print('compressing input files')
        procs = [Popen(['gzip', f]) for f in (fname1, fname2)]

    # sort each tmp file according to its first element (idx) and write them
    # to the output file (without the idx); also sort according to read 2
    # (to filter duplicates) and according to strand
    if verbose:
        print('Sorting each temporary file by genomic coordinate')
    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                            'tmp_%05d.tsv' % b)) as f_tmp:
            out.write(''.join(['\t'.join(l[1:]) for l in sorted(
                [l.split('\t') for l in f_tmp],
                key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if compress:
        for proc in procs:
            proc.communicate()
        system('rm -rf ' + fname1)
        system('rm -rf ' + fname2)

    if verbose:
        print('\nRemoving temporary files...')
    system('rm -rf ' + tmp_dir)

    return count, multiples
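# NOTE: get_intersection() buckets each read pair into one of `nchunks`
# temporary files keyed by the genomic position of the first mate, so the
# final pass only has to sort many small files instead of one huge one. The
# chunk index is, in essence, the cumulative genomic position divided by the
# chunk length; `_example_chunk_index` is a hypothetical sketch reusing the
# CHROM_START / lchunk conventions above, not the pipeline's own code:
def _example_chunk_index(crm, pos, chrom_start, lchunk):
    """Map a (chromosome, position) pair to its chunk number.

    chrom_start: cumulative start of each chromosome in the concatenated
    genome (as built from the '# CRM' header lines); lchunk: genome length
    divided by the number of chunks.
    """
    return (chrom_start[crm] + pos) // lchunk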