def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Getting insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print '  - median insert size =', median
    print '  - double median absolute deviation of insert size =', mad
    print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print ('   Using the maximum continuous fragment size '
           '(%d bp) to check for pseudo-dangling ends') % max_mole
    print ('   Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print "identifying pairs to filter..."
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()

    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)

def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.quality_plot:
        logging.info('Generating Hi-C QC plot at:\n  ' +
                     path.join(opts.workdir,
                               path.split(opts.fastq)[-1] + '.pdf'))
        dangling_ends, ligated = quality_plot(
            opts.fastq, r_enz=opts.renz, nreads=100000, paired=False,
            savefig=path.join(opts.workdir,
                              path.split(opts.fastq)[-1] + '.pdf'))
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends)
        logging.info('  - Ligation sites: %.3f%%', ligated)
        return

    logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                 opts.workdir)
    outfiles = full_mapping(opts.index, opts.fastq,
                            path.join(opts.workdir,
                                      '01_mapped_r%d' % (opts.read)),
                            r_enz=opts.renz, temp_dir=opts.tmp,
                            nthreads=opts.cpus, frag_map=not opts.iterative,
                            clean=not opts.keep_tmp, windows=opts.windows,
                            get_nread=True, skip=opts.skip,
                            suffix=param_hash, **opts.gem_param)

    # adjust line count: when windows were skipped, recompute per-window
    # read counts by subtracting the reads already written to the
    # previous output file
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outfiles, launch_time, finish_time)

    # write machine log: acquire a crude sentinel-file lock first so
    # concurrent jobs do not interleave their writes
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        mlog.write('\n'.join(
            [('# MAPPED READ%s\t%d\t%s' % (opts.read, num, out))
             for out, num in outfiles]) + '\n')
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.quality_plot:
        logging.info('Generating Hi-C QC plot at:\n  ' +
                     path.join(opts.workdir,
                               path.split(opts.fastq)[-1] + '.pdf'))
        dangling_ends, ligated = quality_plot(
            opts.fastq, r_enz=opts.renz, nreads=100000, paired=False,
            savefig=path.join(opts.workdir,
                              path.split(opts.fastq)[-1] + '.pdf'))
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends)
        logging.info('  - Ligation sites: %.3f%%', ligated)
        return

    logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                 opts.workdir)
    outfiles = full_mapping(opts.index, opts.fastq,
                            path.join(opts.workdir,
                                      '01_mapped_r%d' % (opts.read)),
                            opts.renz, temp_dir=opts.tmp,
                            nthreads=opts.cpus, frag_map=not opts.iterative,
                            clean=not opts.keep_tmp,  # only clean when not keeping tmp
                            windows=opts.windows,
                            get_nread=True, skip=opts.skip,
                            suffix=param_hash, **opts.gem_param)

    # adjust line count (see comment in the variant above)
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outfiles, launch_time, finish_time)

    # write machine log, serialising concurrent writers with an advisory
    # lock on the log file itself
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        mlog.write('\n'.join(
            [('# MAPPED READ%s\t%d\t%s' % (opts.read, num, out))
             for out, num in outfiles]) + '\n')
        fcntl.flock(mlog, fcntl.LOCK_UN)

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + opts.tmp)

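# The two mapping variants above serialise writers to trace.log in two
# different ways: a busy-wait on a '__lock_log' sentinel file, and an
# advisory fcntl.flock on the log handle. Below is a minimal sketch (not
# part of TADbit) wrapping the flock pattern in a context manager so the
# lock is always released even if the write raises; unlike the sentinel
# file, an flock is also dropped by the OS if the process dies.
from contextlib import contextmanager
import fcntl

@contextmanager
def locked_append(fname):
    """Open `fname` for appending under an exclusive advisory lock."""
    with open(fname, 'a') as handle:
        fcntl.flock(handle, fcntl.LOCK_EX)  # blocks until the lock is free
        try:
            yield handle
        finally:
            fcntl.flock(handle, fcntl.LOCK_UN)

# usage sketch:
# with locked_append(path.join(opts.workdir, 'trace.log')) as mlog:
#     mlog.write('# MAPPED ...\n')
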
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(opts, reads,
                                                          opts.jobids)
    else:
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allow the use of a pickled genome to make parsing faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        # resume mode: recover mapping statistics from the '# MAPPED'
        # header of the previously parsed files, then tally multi-contact
        # reads by counting the '|||' separator on each line
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log (sentinel-file lock, as in mapping)
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2,
               out_file1, out_file2, launch_time, finish_time)

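# The resume branch above tallies multi-contact reads by counting the
# '|||' separator on each TSV line. A minimal standalone sketch of that
# tally (hypothetical helper, same TSV layout assumed: one read per line,
# extra fragments joined with '|||'):
from collections import Counter

def count_multiples(tsv_lines):
    """Tally reads by how many '|||'-separated extra fragments they carry."""
    multis = Counter()
    for line in tsv_lines:
        n = line.count('|||')
        if n:
            multis[n] += 1
    return multis

# toy usage:
# count_multiples(['a\tb', 'a|||b\tc', 'a|||b|||c\td'])  ->  {1: 1, 2: 1}
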
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time):
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table INTERSECTION_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Total_interactions int,
                Multiple_interactions text,
                Median_fragment_length,
                MAD_fragment_length,
                Max_fragment_length,
                unique (PATHid))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
               (Id, Parameters, Launch_time, Finish_time, Type, Parameters_md5)
            values
               (NULL, '%s', '%s', '%s', 'Filter', '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                   param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur, reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            cur.execute("""
            insert into INTERSECTION_OUTPUTs
               (Id, PATHid, Total_interactions, Multiple_interactions,
                Median_fragment_length, MAD_fragment_length,
                Max_fragment_length)
            values
               (NULL, %d, %d, '%s', %d, %d, %d)
            """ % (get_path_id(cur, mreads, opts.workdir), count,
                   ' '.join(['%s:%d' % (k, multiples[k])
                             for k in sorted(multiples)]),
                   median, mad, max_f))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                   (Id, PATHid, Total_interactions, Multiple_interactions,
                    Median_fragment_length, MAD_fragment_length,
                    Max_fragment_length)
                values
                   (NULL, %d, %d, '%s', %d, %d, %d)
                """ % (get_path_id(cur, mreads, opts.workdir), count,
                       ' '.join(['%s:%d' % (k, multiples[k])
                                 for k in sorted(multiples)]),
                       median, mad, max_f))
        for f in masked:
            add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir)
            try:
                cur.execute("""
                insert into FILTER_OUTPUTs
                   (Id, PATHid, Name, Count, JOBid)
                values
                   (NULL, %d, '%s', '%s', %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'], jobid))
            except lite.IntegrityError:
                print 'WARNING: already filtered'
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' % (
                            get_path_id(cur, masked[f]['fnam'],
                                        opts.workdir)))
                    cur.execute("""
                    insert into FILTER_OUTPUTs
                       (Id, PATHid, Name, Count, JOBid)
                    values
                       (NULL, %d, '%s', '%s', %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'], jobid))
        try:
            cur.execute("""
            insert into FILTER_OUTPUTs
               (Id, PATHid, Name, Count, JOBid)
            values
               (NULL, %d, '%s', '%s', %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   'valid-pairs', n_valid_pairs, jobid))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute('delete from FILTER_OUTPUTs where PATHid = %d' % (
                    get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                   (Id, PATHid, Name, Count, JOBid)
                values
                   (NULL, %d, '%s', '%s', %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       'valid-pairs', n_valid_pairs, jobid))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')

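# The inserts above build SQL by %-interpolating Python values into the
# statement, which breaks as soon as a value contains a quote and is the
# classic injection shape. A minimal sketch (not TADbit's code) of the
# same FILTER_OUTPUTs insert using sqlite3's qmark placeholders, so the
# driver handles quoting; table and column names match the schema
# created above, values are illustrative.
import sqlite3 as lite

def insert_filter_output(cur, pathid, name, count, jobid):
    cur.execute(
        "insert into FILTER_OUTPUTs (Id, PATHid, Name, Count, JOBid) "
        "values (NULL, ?, ?, ?, ?)",
        (pathid, name, count, jobid))

# usage sketch:
# con = lite.connect('trace.db')
# with con:
#     insert_filter_output(con.cursor(), 1, 'valid-pairs', 123456, 7)
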
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))
        if opts.fast_fragment:
            reads = fname1
            counts_multis = ['#' in line.split('\t')[0]
                             for line in open(reads)]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(1 for count_mult in counts_multis
                               if count_mult)
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1, fname2, reads,
                                                compress=opts.compress_input)

    # compute insert size
    print('Getting insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    try:
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
    except ZeroDivisionError:
        warn('WARNING: cannot compute fragment length, too few '
             'dangling-ends. Setting median length to 400 nt.')
        median = max_f = mad = 0
    if median < 50:
        warn('WARNING: fragment length too short ({}). '
             'Setting median length to 400 nt.'.format(median))
        median, max_f, mad = 400, 100, 40
    if opts.median:
        median = opts.median
    if opts.max_f:
        max_f = opts.max_f
    if opts.mad:
        mad = opts.mad

    print('  - median insert size =', median)
    print('  - median absolute deviation of insert size =', mad)
    print('  - max insert size (when a gap in continuity of > 10 bp is '
          'found in fragment lengths) =', max_f)

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print('   Using the maximum continuous fragment size '
          '(%d bp) to check for pseudo-dangling ends' % max_mole)
    print('   Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print("identifying pairs to filter...")
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          strict_duplicates=opts.strict_duplicates,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    infile = mreads if opts.valid else reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()

    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)

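# fragment_size() above summarises dangling-end fragment lengths as a
# median, a MAD, and a 'first decay' point, which feed the
# pseudo-dangling-end (max_mole) and random-break (max_f + mad) cut-offs.
# A minimal numpy sketch of the median/MAD part (not TADbit's
# implementation; the 'first_decay' estimate is omitted):
import numpy as np

def median_and_mad(lengths):
    """Median and median absolute deviation of fragment lengths."""
    lengths = np.asarray(lengths, dtype=float)
    med = np.median(lengths)
    mad = np.median(np.abs(lengths - med))
    return med, mad

# toy usage:
# med, mad = median_and_mad([180, 210, 220, 230, 800])
# a random-break threshold would then be something like max_f + mad
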
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file, chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num': len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # translate genome-wide bad-column indices into a 0/1 mask
            # in the chromosome's own coordinates
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = (size if opts.max_tad_size is None
                            else opts.max_tad_size)
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size, no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end',
                                               'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path': out_tad, 'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)

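# The TAD search above slices each chromosome out of the genome-wide
# matrix, so bad columns (stored as genome-wide bin indices in
# hic_data.bads) must be re-expressed in chromosome-local coordinates.
# A toy illustration of that to_rm conversion (hypothetical values:
# genome-wide bins 100..199 belong to the chromosome being segmented,
# and bins 103 and 150 were flagged as bad):
bads = {103: True, 150: True}
beg, end = 100, 200
to_rm = tuple(1 if i in bads else 0 for i in range(beg, end))
assert to_rm[3] == 1 and to_rm[50] == 1 and sum(to_rm) == 2
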
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Getting poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros, min_count=opts.min_count,
            draw_hist=True, by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization',
                'bad_columns_%s_%d_%d_%s.pdf' % (
                    opts.reso, opts.perc_zeros, opts.min_count,
                    param_hash)) if not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%d_%s.tsv' % (
            opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Getting biases using ICE...'
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=True)
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False,
                                             diagonal=True)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False,
                                             diagonal=False)
    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.pdf' % (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=not opts.filter_only, savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias]) + '\n')
        out_bias.close()

    # pickle the HiC-data object
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso),
                                                       param_hash))
    out = open(pickle_path, 'wb')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(
                    opts.workdir, '04_normalization',
                    'inter_chromosome_nrm_images_%s_%s' % (opts.reso,
                                                           param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso,
                                                         param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(
                    opts.workdir, '04_normalization',
                    'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(
                opts.workdir, '04_normalization',
                'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads, len(hic_data.bads.keys()),
               len(hic_data),
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time)

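# normalize_hic() above performs ICE-style iterative correction. A
# minimal numpy sketch of the basic ICE idea (Imakaev et al. 2012), not
# TADbit's implementation; bad columns are assumed to be removed first:
import numpy as np

def ice_balance(matrix, n_iter=50, eps=1e-4):
    """Iteratively divide rows/columns by their coverage until the
    matrix is approximately doubly stochastic; returns the balanced
    matrix and the accumulated per-bin biases."""
    m = np.array(matrix, dtype=float)
    bias = np.ones(len(m))
    for _ in range(n_iter):
        cov = m.sum(axis=0)
        cov /= cov[cov > 0].mean()   # normalise to mean coverage 1
        cov[cov == 0] = 1            # leave empty columns untouched
        m /= np.outer(cov, cov)
        bias *= cov
        if np.abs(cov - 1).max() < eps:
            break
    return m, bias
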
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    # create tmp directory
    if not opts.tmp:
        temp_dir = opts.workdir + '_tmp_r%d_%s' % (opts.read, param_hash)
    else:
        temp_dir = path.join(opts.tmp,
                             'TADbit_tmp_r%d_%s' % (opts.read, param_hash))

    # QC plot
    fig_path = path.join(opts.workdir,
                         '%s_%s_%s.png' % (path.split(opts.fastq)[-1],
                                           '-'.join(map(str, opts.renz)),
                                           param_hash))
    logging.info('Generating Hi-C QC plot')

    dangling_ends, ligated = quality_plot(opts.fastq, r_enz=opts.renz,
                                          nreads=100000, paired=False,
                                          savefig=fig_path)
    for renz in dangling_ends:
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends[renz])
    for renz in ligated:
        logging.info('  - Ligation sites: %.3f%%', ligated[renz])
    if opts.skip_mapping:
        save_to_db(opts, dangling_ends, ligated, fig_path, [],
                   launch_time, time.localtime())
        return

    # Mapping
    if opts.fast_fragment:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))
        logging.info('parsing genomic sequence')
        try:
            # allow the use of a pickled genome to make parsing faster
            genome_seq = load(open(opts.genome[0], 'rb'))
        except (UnpicklingError, KeyError):
            genome_seq = parse_fasta(opts.genome)
        logging.info('mapping %s and %s to %s', opts.fastq, opts.fastq2,
                     opts.workdir)
        outfiles = fast_fragment_mapping(
            opts.index, opts.fastq, opts.fastq2, opts.renz, genome_seq,
            path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash),
            clean=not opts.keep_tmp, get_nread=True,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param, suffix=param_hash,
            temp_dir=temp_dir, nthreads=opts.cpus)
    else:
        logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                     opts.workdir)
        outfiles = full_mapping(
            opts.index, opts.fastq,
            path.join(opts.workdir, '01_mapped_r%d' % (opts.read)),
            mapper=opts.mapper, r_enz=opts.renz, temp_dir=temp_dir,
            nthreads=opts.cpus, frag_map=not opts.iterative,
            clean=not opts.keep_tmp, windows=opts.windows,
            get_nread=True, skip=opts.skip, suffix=param_hash,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param)

    # adjust line count: when windows were skipped, recompute per-window
    # read counts by subtracting the reads already present in the
    # previous output file
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    try:
        save_to_db(opts, dangling_ends, ligated, fig_path, outfiles,
                   launch_time, finish_time)
    except Exception:
        # release lock anyway
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # write machine log
    try:
        while path.exists(path.join(opts.workdir, '__lock_log')):
            time.sleep(0.5)
        open(path.join(opts.workdir, '__lock_log'), 'a').close()
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            mlog.write('\n'.join(
                [('# MAPPED READ%s\t%d\t%s' % (opts.read, num, out))
                 for out, num in outfiles]) + '\n')
        # release lock
        try:
            remove(path.join(opts.workdir, '__lock_log'))
        except OSError:
            pass
    except Exception:
        # release lock
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + temp_dir)

def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments '
                        'to be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso,
                                           biases=biases1,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso,
                                           biases=biases2,
                                           tmpdir=path.join(opts.workdir,
                                                            '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print '      - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm, verbose=False,
                                     remove_bad_columns=True)
        print '      - reproducibility score: %.4f' % (reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Merging experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam,
                                                 mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line accordingly
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE
                                           ).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools + ' index %s' % (outbam))

    finish_time = time.localtime()

    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, launch_time, finish_time)
    printime('\nDone.')

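# The merge step above shells out with system(samtools + ' merge ...')
# and probes the samtools version by parsing its usage text. A minimal
# sketch (not TADbit's code) of the same two calls via subprocess, which
# avoids building shell strings; it assumes samtools >= 1.3.1, which
# accepts -@ for index (older versions need the flag dropped, as above):
import subprocess

def merge_and_index(samtools, cpus, outbam, bam1, bam2):
    """samtools merge + index through subprocess, without a shell."""
    subprocess.check_call([samtools, 'merge', '-@', str(cpus),
                           outbam, bam1, bam2])
    subprocess.check_call([samtools, 'index', '-@', str(cpus), outbam])
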
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    print('''
%s%s

  - Region: Chromosome %s from %d to %d at resolution %s (%d particles)
''' % ('Preparing ' if opts.job_list else '',
       ('Optimization\n' + '*' * (21 if opts.job_list else 11))
       if opts.optimize else
       ('Modeling\n' + '*' * (18 if opts.job_list else 8)),
       opts.crm, opts.ori_beg, opts.ori_end, nicer(opts.reso),
       opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    batch_job_hash = digest_parameters(
        opts, get_md5=True,
        extra=['maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff',
               'nmodels_run', 'job_list', 'rand', 'nmodels', 'nkeep',
               'optimize', 'optimization_id', 'cpus', 'workdir', 'matrix',
               'ori_beg', 'ori_end'])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm,
                                           opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(outdir, 'job_list_%s.q' % ('optimization'
                                                 if opts.optimize
                                                 else 'modeling')), 'w')
    else:
        job_file_handler = None

    ###############
    # Optimization
    if opts.optimize:
        print '  o Optimizing parameters'
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        print '\n  optimization done'

    # correlate all optimizations and get the best set of parameters
    if not (opts.optimize and opts.job_list):
        optpar, results = correlate_models(opts, outdir, exp)
    else:
        results = []

    ###########
    # Modeling
    if not opts.optimize:
        big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash,
               launch_time, finish_time)

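# digest_parameters() is used throughout to derive a short hash that
# tags every output file and deduplicates jobs in the DB; above, the
# swept optimization options are excluded so that all optimization runs
# of one region share a single batch_job_hash. A hedged sketch of that
# mechanism (hypothetical stand-in, not TADbit's function):
from hashlib import md5

def digest_namespace(opts, exclude=()):
    """Serialise the option namespace in a stable order, skipping
    excluded keys, and keep a short md5 prefix."""
    items = sorted((k, v) for k, v in vars(opts).items()
                   if k not in exclude)
    return md5(repr(items).encode('utf-8')).hexdigest()[:12]

# all optimization jobs of a region share a hash because the swept
# parameters ('maxdist', 'upfreq', ...) are excluded from the digest
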
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path '
                            'to biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = start1 = end1 = None
        region2 = start2 = end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = start2 = end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if not opts.quiet:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
    elif not opts.quiet:
        stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm, region1=region1, start1=start1,
                    end1=end1, region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = (
                    (reg, p + 1, p + opts.reso)
                    for r, reg in enumerate(regions)
                    for p in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r]
                        else sections[reg],
                        opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (
                    norm, name, nicer(opts.reso).replace(' ', ''),
                    ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join(
                        '%s\t%d\t%d\t' % (row_names.next()) +
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join(
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (
                    norm, name, nicer(opts.reso).replace(' ', ''),
                    ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                # floor zeros below the smallest non-zero count so that
                # the log2 transform below stays finite
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # grey out bad rows/columns through a masked array
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix, interpolation='None', origin='lower',
                           cmap=cmap, vmin=vmin, vmax=vmax)
                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = (pltbeg1 if len(regions) == 1 else
                               0 if start2 is None else start2)
                    pltend2 = (pltend1 if len(regions) == 1 else
                               sections[regions[-1]] if end2 is None
                               else end2)
                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(
                        FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(
                        FuncFormatter(format_yticks))
                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([float(vals[i] + vals[i + 1]) / 2
                                    for i in xrange(len(vals) - 1)],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data, color='darkgrey', linewidth=2,
                             orientation='horizontal', bins=50,
                             histtype='step', normed=True)
                _ = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s'
                              % (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))

    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso, load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)

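# The plotting branch above replaces zeros with half the smallest
# non-zero count, masks bad rows/columns, and draws log2 counts with the
# mask painted grey. A hedged, self-contained matplotlib/numpy sketch of
# that recipe (toy dense input assumed, with at least one non-zero
# count; not TADbit's code):
import numpy as np
import matplotlib.pyplot as plt

def plot_log2_hic(counts, bad_rows=(), bad_cols=(), cmap_name='viridis'):
    """Log2 heat map of a raw Hi-C count matrix with greyed-out bins."""
    m = np.array(counts, dtype=float)
    mini = m[m > 0].min() / 2.0      # floor zeros below the smallest count
    m[m == 0] = mini
    mask = np.zeros_like(m, dtype=bool)
    mask[list(bad_rows), :] = True
    mask[:, list(bad_cols)] = True
    logm = np.log2(np.ma.masked_array(m, mask))
    cmap = plt.get_cmap(cmap_name)
    cmap.set_bad('grey', 1.0)        # masked bins rendered in grey
    plt.imshow(logm, origin='lower', interpolation='none', cmap=cmap)
    plt.colorbar(label='Hi-C log2 interactions')
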
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               nbad_columns, ncolumns, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, biases2, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MERGE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
            cur.execute("""
            create table MERGE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Wrkd1Path int,
                Wrkd2Path int,
                Bed1Path int,
                Bed2Path int,
                MergePath int,
                unique (JOBid))""")
            cur.execute("""
            create table MERGE_STATs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                decay_corr text,
                eigen_corr text,
                reprod real,
                scc real,
                std_scc real,
                N_columns int,
                N_filtered int,
                Resolution int,
                bias1Path int,
                bias2Path int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
               (Id, Parameters, Launch_time, Finish_time, Type,
                Parameters_md5)
            values
               (NULL, '%s', '%s', '%s', 'Merge', '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                   param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)
        add_path(cur, decay_corr_dat, 'CORR', jobid, opts.workdir)
        add_path(cur, decay_corr_fig, 'FIGURE', jobid, opts.workdir)
        add_path(cur, eigen_corr_dat, 'CORR', jobid, opts.workdir)
        add_path(cur, eigen_corr_fig, 'FIGURE', jobid, opts.workdir)
        add_path(cur, opts.workdir, 'WORKDIR', jobid)
        add_path(cur, opts.workdir1, 'WORKDIR1', jobid, opts.workdir)
        add_path(cur, opts.workdir2, 'WORKDIR2', jobid, opts.workdir)
        add_path(cur, mreads1, 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, mreads2, 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, outbed, 'HIC_BAM', jobid, opts.workdir)
        if opts.norm:
            add_path(cur, biases1, 'BIASES', jobid, opts.workdir)
            add_path(cur, biases2, 'BIASES', jobid, opts.workdir)
            biasid1 = get_path_id(cur, biases1, opts.workdir)
            biasid2 = get_path_id(cur, biases2, opts.workdir)
        else:
            biasid1 = 0
            biasid2 = 0

        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads1, opts.workdir)))
        bed1 = cur.fetchall()[0][0]
        if opts.workdir1:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir1, opts.workdir)))
            w1path = cur.fetchall()[0][0]
        else:
            w1path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads2, opts.workdir)))
        bed2 = cur.fetchall()[0][0]
        if opts.workdir2:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir2, opts.workdir)))
            w2path = cur.fetchall()[0][0]
        else:
            w2path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(outbed, opts.workdir)))
        outbedid = cur.fetchall()[0][0]
        if not opts.skip_comparison:
            decay_corr = '-'.join(
                ['%.1f' % (v) for v in corr[:10:2]]).replace('0.', '.')
            eigen_corr = '-'.join(
                ['%.2f' % (max(v)) for v in eig_corr[:4]]).replace('0.', '.')
        else:
            decay_corr = eigen_corr = None
        cur.execute("""
        insert into MERGE_OUTPUTs
           (Id, JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath)
        values
           (NULL, %d, %d, %d, %d, %d, %d)
        """ % (jobid, w1path, w2path, bed1, bed2, outbedid))

        if not opts.skip_comparison:
            cur.execute("""
            insert into MERGE_STATs
               (Id, JOBid, N_columns, N_filtered, Resolution, reprod, scc,
                std_scc, decay_corr, eigen_corr, bias1Path, bias2Path)
            values
               (NULL, %d, %d, %d, %d, %f, %f, %f, '%s', '%s', %d, %d)
            """ % (jobid, ncolumns, nbad_columns, opts.reso, reprod, scc,
                   std, decay_corr, eigen_corr, biasid1, biasid2))

        masked1 = {'valid-pairs': {'count': 0}}
        if opts.workdir1:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile1 = opts.tmpdb1
                try:  # copy in case read1 was already mapped for example
                    copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1)
                except IOError:
                    pass
            else:
                dbfile1 = path.join(opts.workdir1, 'trace.db')
            tmpcon = lite.connect(dbfile1)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count "
                               "from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute(
                        "select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked1[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile1)

        masked2 = {'valid-pairs': {'count': 0}}
        if opts.workdir2:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile2 = opts.tmpdb2
                try:  # copy in case read2 was already mapped for example
                    copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2)
                except IOError:
                    pass
            else:
                dbfile2 = path.join(opts.workdir2, 'trace.db')
            tmpcon = lite.connect(dbfile2)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count "
                               "from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute(
                        "select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked2[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile2)

        for f in masked1:
            if f != 'valid-pairs':
                outmask = path.join(
                    opts.workdir, '03_filtered_reads',
                    'all_r1-r2_intersection_%s.tsv_%s.tsv' % (param_hash, f))
                out = open(outmask, 'w')
                for line in open(path.join(opts.workdir1,
                                           masked1[f]['path'])):
                    out.write(line)
                for line in open(path.join(opts.workdir2,
                                           masked2[f]['path'])):
                    out.write(line)
                add_path(cur, outmask, 'FILTER', jobid, opts.workdir)
            else:
                outmask = outbed
            cur.execute("""
            insert into FILTER_OUTPUTs
               (Id, PATHid, Name, Count, JOBid)
            values
               (NULL, %d, '%s', '%s', %d)
            """ % (get_path_id(cur, outmask, opts.workdir), f,
                   masked1[f]['count'] + masked2[f]['count'], jobid))

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'MERGE_OUTPUTs')
        print_db(cur, 'MERGE_STATs')
        print_db(cur, 'FILTER_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass

def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam, hist_path, median, max_f, mad,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table INTERSECTION_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Total_interactions int,
                Multiple_interactions text,
                Median_fragment_length,
                MAD_fragment_length,
                Max_fragment_length,
                unique (PATHid))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                Applied text,
                JOBid int,
                unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
               (Id, Parameters, Launch_time, Finish_time, Type,
                Parameters_md5)
            values
               (NULL, '%s', '%s', '%s', 'Filter', '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                   param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)

        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur, outbam, 'HIC_BAM', jobid, opts.workdir)
        add_path(cur, outbam + '.bai', 'HIC_BAI', jobid, opts.workdir)
        add_path(cur, reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            cur.execute("""
            insert into INTERSECTION_OUTPUTs
               (Id, PATHid, Total_interactions, Multiple_interactions,
                Median_fragment_length, MAD_fragment_length,
                Max_fragment_length)
            values
               (NULL, %d, %d, '%s', %d, %d, %d)
            """ % (get_path_id(cur, mreads, opts.workdir), count,
                   ' '.join(['%s:%d' % (k, multiples[k])
                             for k in sorted(multiples)]),
                   median, mad, max_f))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                   (Id, PATHid, Total_interactions, Multiple_interactions,
                    Median_fragment_length, MAD_fragment_length,
                    Max_fragment_length)
                values
                   (NULL, %d, %d, '%s', %d, %d, %d)
                """ % (get_path_id(cur, mreads, opts.workdir), count,
                       ' '.join(['%s:%d' % (k, multiples[k])
                                 for k in sorted(multiples)]),
                       median, mad, max_f))
        for nf, f in enumerate(masked, 1):
            try:
                add_path(cur, masked[f]['fnam'], 'FILTER', jobid,
                         opts.workdir)
            except KeyError:
                continue
            try:
                cur.execute("""
                insert into FILTER_OUTPUTs
                   (Id, PATHid, Name, Count, Applied, JOBid)
                values
                   (NULL, %d, '%s', '%s', '%s', %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'],
                       'True' if nf in opts.apply else 'False', jobid))
            except lite.IntegrityError:
                print 'WARNING: already filtered'
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' % (
                            get_path_id(cur, masked[f]['fnam'],
                                        opts.workdir)))
                    cur.execute("""
                    insert into FILTER_OUTPUTs
                       (Id, PATHid, Name, Count, Applied, JOBid)
                    values
                       (NULL, %d, '%s', '%s', '%s', %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'],
                           'True' if nf in opts.apply else 'False', jobid))
        try:
            cur.execute("""
            insert into FILTER_OUTPUTs
               (Id, PATHid, Name, Count, Applied, JOBid)
            values
               (NULL, %d, '%s', '%s', '%s', %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   'valid-pairs', n_valid_pairs, '', jobid))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from FILTER_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                   (Id, PATHid, Name, Count, Applied, JOBid)
                values
                   (NULL, %d, '%s', '%s', '%s', %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       'valid-pairs', n_valid_pairs, '', jobid))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass

def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments '
                        'to be merged')

    print 'loading first sample', mreads1
    hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

    print 'loading second sample', mreads2
    hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

    if opts.norm and biases1:
        bad_co1 = path.join(opts.workdir1, bad_co1)
        print 'loading bad columns from first sample', bad_co1
        hic_data1.bads = dict((int(l.strip()), True)
                              for l in open(bad_co1))
        biases1 = path.join(opts.workdir1, biases1)
        print 'loading biases from first sample', biases1
        hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases1))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')
    if opts.norm and biases2:
        bad_co2 = path.join(opts.workdir2, bad_co2)
        print 'loading bad columns from second sample', bad_co2
        hic_data2.bads = dict((int(l.strip()), True)
                              for l in open(bad_co2))
        biases2 = path.join(opts.workdir2, biases2)
        print 'loading biases from second sample', biases2
        hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases2))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso,
                                                                 param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso,
                                                                 param_hash))
    else:
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:  # has bias file
    if not opts.skip_comparison:
        print 'correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print 'correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = None
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))

    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, bad_co1, biases2, bad_co2,
               launch_time, finish_time)

def save_to_db(opts, outdir, results, batch_job_hash, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='JOBs'""") if not cur.fetchall(): cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='MODELED_REGIONs'""") if not cur.fetchall(): cur.execute(""" create table MODELED_REGIONs (Id integer primary key, PATHid int, PARAM_md5 text, RESO int, BEG int, END int, unique (PARAM_md5))""") cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='MODELs'""") if not cur.fetchall(): cur.execute(""" create table MODELs (Id integer primary key, REGIONid int, JOBid int, OPTPAR_md5 text, MaxDist int, UpFreq int, LowFreq int, Scale int, Cutoff int, Nmodels int, Kept int, Correlation int)""") try: parameters = digest_parameters(opts, get_md5=False) # In case optimization or modeling is split in different computers param_hash = digest_parameters(opts, get_md5=True) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', '%s', '%s') """ % ((parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), (('PRE_' if opts.job_list else '') + ('OPTIM' if opts.optimize else 'MODEL')), param_hash))) except lite.IntegrityError: pass ##### STORE OPTIMIZATION RESULT jobid = get_jobid(cur) add_path(cur, outdir, 'DIR', jobid, opts.workdir) pathid = get_path_id(cur, outdir, opts.workdir) # models = compile_models(opts, outdir, exp=exp, ngood=opts.nkeep) ### STORE GENERAL OPTIMIZATION INFO try: cur.execute(""" insert into MODELED_REGIONs (Id , PATHid, PARAM_md5, RESO, BEG, END) values (NULL, %d, "%s", %d, %d, %d) """ % (pathid, batch_job_hash, opts.reso, opts.beg, opts.end)) except lite.IntegrityError: pass ### STORE EACH OPTIMIZATION cur.execute("SELECT Id from MODELED_REGIONs where PARAM_md5='%s'" % (batch_job_hash)) optimid = cur.fetchall()[0][0] for m, u, l, d, s in results: optpar_md5 = md5('%s%s%s%s%s' % (m, u, l, d, s)).hexdigest()[:12] cur.execute( ("SELECT Id from MODELs where " "OPTPAR_md5='%s' and REGIONid='%s'") % (optpar_md5, optimid)) if not cur.fetchall(): cur.execute(""" insert into MODELs (Id , REGIONid, JOBid, OPTPAR_md5, MaxDist, UpFreq, LowFreq, Cutoff, Scale, Nmodels, Kept, Correlation) values (NULL, %d, %d, '%s', %s, %s, %s, %s, %s, %d, %d, %f) """ % ((optimid, jobid, optpar_md5, m, u, l, d, s, results[(m, u, l, d, s)]['nmodels'], results[(m, u, l, d, s)]['kept'], results[(m, u, l, d, s)]['corr']))) else: cur.execute( ("update MODELs " "set Nmodels = %d, Kept = %d, Correlation = %f " "where " "OPTPAR_md5='%s' and REGIONid='%s'") % (results[(m, u, l, d, s)]['nmodels'], results[(m, u, l, d, s)]['kept'], results[(m, u, l, d, s)]['corr'], optpar_md5, optimid)) 
        ### MODELING
        if not opts.optimization_id:
            cur.execute("SELECT Id from MODELED_REGIONs")
            optimids = cur.fetchall()
            if len(optimids) > 1:
                raise IndexError("ERROR: more than 1 optimization in folder; "
                                 "choose with 'tadbit describe' and "
                                 "--optimization_id")
            optimid = optimids[0][0]
        else:
            cur.execute("SELECT Id from MODELED_REGIONs where Id=%d" % (
                opts.optimization_id))
            optimid = cur.fetchall()[0][0]
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
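# Illustrative sketch (not part of the tool): the MODELs insert/update above
# builds SQL via %-interpolation. sqlite3 can bind values with "?"
# placeholders instead, which avoids quoting problems with parameter strings.
# `_upsert_model` is a hypothetical helper, assuming the MODELs schema
# created above.
def _upsert_model(cur, optimid, jobid, optpar_md5, m, u, l, d, s, stat):
    cur.execute("SELECT Id FROM MODELs WHERE OPTPAR_md5=? AND REGIONid=?",
                (optpar_md5, optimid))
    if cur.fetchall():
        # already stored: refresh the counts and correlation
        cur.execute("UPDATE MODELs SET Nmodels=?, Kept=?, Correlation=? "
                    "WHERE OPTPAR_md5=? AND REGIONid=?",
                    (stat['nmodels'], stat['kept'], stat['corr'],
                     optpar_md5, optimid))
    else:
        cur.execute("INSERT INTO MODELs (Id, REGIONid, JOBid, OPTPAR_md5, "
                    "MaxDist, UpFreq, LowFreq, Cutoff, Scale, Nmodels, Kept, "
                    "Correlation) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    (optimid, jobid, optpar_md5, m, u, l, d, s,
                     stat['nmodels'], stat['kept'], stat['corr']))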
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')
    mkdir(path.join(opts.workdir, '00_merge'))
    if not opts.skip_comparison:
        print 'Comparison'
        print ' - loading first sample', mreads1
        hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)
        print ' - loading second sample', mreads2
        hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)
        if opts.norm and biases1:
            bad_co1 = path.join(opts.workdir1, bad_co1)
            print ' - loading bad columns from first sample', bad_co1
            hic_data1.bads = dict((int(l.strip()), True)
                                  for l in open(bad_co1))
            biases1 = path.join(opts.workdir1, biases1)
            print ' - loading biases from first sample', biases1
            hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases1))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        if opts.norm and biases2:
            bad_co2 = path.join(opts.workdir2, bad_co2)
            print ' - loading bad columns from second sample', bad_co2
            hic_data2.bads = dict((int(l.strip()), True)
                                  for l in open(bad_co2))
            biases2 = path.join(opts.workdir2, biases2)
            print ' - loading biases from second sample', biases2
            hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases2))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        decay_corr_dat = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge',
                                   'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge',
                                   'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        hic_data1 = {}
        hic_data2 = {}
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
    # if opts.norm:  # has bias file
    if not opts.skip_comparison:
        print ' => correlation between equidistant loci'
        corr, _, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print ' => correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, nvect=6, savefig=eigen_corr_fig,
            savedata=eigen_corr_dat)
    else:
        corr = eig_corr = 0
        bads = {}
    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))
    print '\nMerging...'
    nreads = merge_2d_beds(mreads1, mreads2, outbed)
    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat,
               eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1,
               biases2, bad_co2, launch_time, finish_time)
    print '\n\nDone.'
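# Illustrative sketch (not part of the tool): both samples above load bias
# and bad-column files with inline generator expressions over open(). The
# same parsing as small helpers that close their file handles;
# `load_bias_tsv` and `load_bad_columns` are hypothetical names.
def load_bias_tsv(fname):
    """Read a two-column TSV of <bin index> <bias factor> into a dict."""
    biases = {}
    with open(fname) as fhandler:
        for line in fhandler:
            col, bias = line.split()
            biases[int(col)] = float(bias)
    return biases

def load_bad_columns(fname):
    """Read a one-column file of filtered bin indices into a dict."""
    with open(fname) as fhandler:
        return dict((int(line.strip()), True) for line in fhandler)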
def save_to_db(opts, bias_file, mreads, bad_col_image, nbad_columns, ncolumns,
               raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, bam_filter,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                N_columns int,
                N_filtered int,
                BAM_filter int,
                Cis_percentage_Raw real,
                Cis_percentage_Norm real,
                Slope_700kb_10Mb real,
                Resolution int,
                Normalization text,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type       , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, bias_file, 'BIASES', jobid, opts.workdir)
        add_path(cur, bad_col_image, 'FIGURE', jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir)
        if opts.bam:
            add_path(cur, path.realpath(opts.bam), 'EXT_2D_BAM', jobid, opts.workdir)
        if opts.mappability:
            add_path(cur, path.realpath(opts.mappability), 'EXT_MAPPABILITY',
                     jobid, opts.workdir)
        if opts.fasta:
            add_path(cur, path.realpath(opts.fasta), 'EXT_FASTA', jobid, opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]
        a2 = 0 if isnan(a2) else a2
        try:
            cur.execute("""
            insert into NORMALIZE_OUTPUTs
            (Id  , JOBid, Input, N_columns, N_filtered, BAM_filter, Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb, Resolution, Normalization, Factor)
            values
            (NULL,    %d,    %d,        %d,         %d,         %d,                 %f,                  %f,               %f,         %d,          '%s',     %f)
            """ % (jobid, input_bed, ncolumns, nbad_columns, bam_filter,
                   100 * raw_cisprc, 100 * norm_cisprc, a2, opts.reso,
                   opts.normalization, opts.factor))
        except lite.OperationalError:
            print 'WARNING: Normalized table not written!!!'
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        try:
            print_db(cur, 'FILTER_OUTPUTs')
            print_db(cur, 'INTERSECTION_OUTPUTs')
            print_db(cur, 'MAPPED_INPUTs')
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
        except lite.OperationalError:
            pass
        print_db(cur, 'NORMALIZE_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
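# Illustrative sketch (not part of the tool): every save_to_db in this module
# repeats the same protocol around trace.db -- wait on the '__lock_db' file,
# create it, optionally work on a temporary copy, then copy back and release.
# A hypothetical context manager expressing that protocol once, assuming the
# path/time/copyfile/remove imports used above (note: the exists()+open()
# lock is not atomic; this mirrors the code above, it does not harden it).
from contextlib import contextmanager

@contextmanager
def trace_db_lock(workdir, tmpdb=None):
    lock = path.join(workdir, '__lock_db')
    while path.exists(lock):   # wait for other processes
        time.sleep(0.5)
    open(lock, 'a').close()    # take the lock
    dbfile = tmpdb or path.join(workdir, 'trace.db')
    if tmpdb:
        try:  # work on a copy when a temporary DB path is given
            copyfile(path.join(workdir, 'trace.db'), tmpdb)
        except IOError:
            pass
    try:
        yield dbfile
        if tmpdb:  # copy the result back
            copyfile(tmpdb, path.join(workdir, 'trace.db'))
            remove(tmpdb)
    finally:
        try:
            remove(lock)  # always release the lock
        except OSError:
            pass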
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None
    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        opts.figsize = None  # no user-supplied size: let matplotlib choose

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if not opts.quiet:
            stdout.write('\nExtraction of %s' % (region1))
        if start1:
            if not opts.quiet:
                stdout.write(':%s-%s' % (start1, end1))
        else:
            if not opts.quiet:
                stdout.write(' (full chromosome)')
        if region2:
            if not opts.quiet:
                stdout.write(' intersection with %s' % (region2))
            if start2:
                if not opts.quiet:
                    stdout.write(':%s-%s\n' % (start2, end2))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)\n')
        else:
            if not opts.quiet:
                stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus, return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts)
                                            and starts[r] else 0,
                                            ends[r] if r < len(ends)
                                            and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join(
                        '%s\t%d\t%d\t' % (row_names.next()) +
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join(
                        '\t'.join(str(matrix.get((i, j), 0))
                                  for i in xrange(b1, e1))
                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
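# Illustrative sketch (not part of the tool): the coord1/coord2 parsing above
# accepts either 'chrN' or 'chrN:begin-end'. The same logic isolated in a
# hypothetical helper:
def parse_coord(coord):
    """Return (region, start, end); start/end are None for a full chromosome."""
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        # no ':' (or no '-'): treat the whole string as a chromosome name
        return coord, None, None

# e.g. parse_coord('chr3:1000000-2000000') -> ('chr3', 1000000, 2000000)
#      parse_coord('chr3')                 -> ('chr3', None, None)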
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different from the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes: pad each chromosome to its expected number of
        # bins (bin count derived from the chromosome length)
        for c in refs:
            if not c in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso,
        #                                         pos * opts.reso + opts.reso,
        #                                         mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and bad columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')
    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
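# Illustrative sketch (not part of the tool): the biases pickle written above
# stores a plain dict. Reading it back for downstream steps, assuming the
# same keys ('biases', 'decay', 'badcol', 'resolution'); the import is a
# guess at the module-level pickle import:
from cPickle import load as _pickle_load  # `pickle` in Python 3

def read_biases_pickle(bias_file):
    fhandler = open(bias_file, 'rb')
    try:
        data = _pickle_load(fhandler)
    finally:
        fhandler.close()
    # bin-level normalization factors, expected decay, filtered columns
    return (data['biases'], data['decay'], data['badcol'],
            data['resolution'])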
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros, draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization',
                'bad_columns_%s_%d_%s.pdf' % (
                    opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:
        hic_data.filter_columns(
            perc_zero=100, draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization',
                'bad_columns_%s_%d_%s.pdf' % (
                    opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.pdf' % (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                              for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        # figures are skipped with --only_txt, as in the intra/inter cases
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data, normalized=True, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
               a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2, launch_time, finish_time): con = lite.connect(path.join(opts.workdir, 'trace.db')) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='PARSED_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table MAPPED_OUTPUTs (Id integer primary key, PATHid int, BEDid int, Uniquely_mapped int, unique (PATHid, BEDid))""") cur.execute(""" create table PARSED_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiples int, unique (PATHid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True ) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Parse', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, out_file1, 'BED', jobid, opts.workdir) for genome in opts.genome: add_path(cur, genome, 'FASTA', jobid, opts.workdir) if out_file2: add_path(cur, out_file2, 'BED', jobid, opts.workdir) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for count in counts: try: sum_reads = 0 for i, item in enumerate(counts[count]): cur.execute(""" insert into MAPPED_OUTPUTs (Id , PATHid, BEDid, Uniquely_mapped) values (NULL, %d, %d, %d) """ % (get_path_id(cur, fnames[count][i], opts.workdir), get_path_id(cur, outfiles[count], opts.workdir), counts[count][item])) sum_reads += counts[count][item] except lite.IntegrityError: print 'WARNING: already parsed (MAPPED_OUTPUTs)' try: cur.execute(""" insert into PARSED_OUTPUTs (Id , PATHid, Total_interactions, Multiples) values (NULL, %d, %d, %d) """ % (get_path_id(cur, outfiles[count], opts.workdir), sum_reads, multis[count])) except lite.IntegrityError: print 'WARNING: already parsed (PARSED_OUTPUTs)' print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs')
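# Illustrative sketch (not part of the tool): the save_to_db functions probe
# sqlite_master before each CREATE TABLE. SQLite can do that probe itself
# with "IF NOT EXISTS", e.g. for the PARSED_OUTPUTs table above:
def create_parsed_outputs_table(cur):
    cur.execute("""
    create table if not exists PARSED_OUTPUTs
       (Id integer primary key,
        PATHid int,
        Total_interactions int,
        Multiples int,
        unique (PATHid))""")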
def save_to_db(opts, dangling_ends, ligated, fig_path, outfiles,
               launch_time, finish_time):
    """
    write little DB to keep track of processes and options
    """
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        # check if table exists
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MAPPED_INPUTs'""")
        if not cur.fetchall():
            try:
                cur.execute("""
                create table PATHs
                   (Id integer primary key,
                    JOBid int, Path text, Type text,
                    unique (Path))""")
            except lite.OperationalError:
                pass  # may happen when mapped files are cleaned
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table MAPPED_INPUTs
               (Id integer primary key,
                PATHid int,
                Entries int,
                Trim text,
                Frag text,
                Read int,
                Enzyme text,
                Dangling_Ends text,
                Ligation_Sites text,
                WRKDIRid int,
                MAPPED_OUTPUTid int,
                INDEXid int,
                unique (PATHid,Entries,Read,Enzyme,WRKDIRid,MAPPED_OUTPUTid,INDEXid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Map',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, opts.workdir, 'WORKDIR', jobid)
        add_path(cur, opts.fastq, 'MAPPED_FASTQ', jobid, opts.workdir)
        add_path(cur, opts.index, 'INDEX', jobid, opts.workdir)
        add_path(cur, fig_path, 'FIGURE', jobid, opts.workdir)
        for i, (out, num) in enumerate(outfiles):
            try:
                window = opts.windows[i]
            except IndexError:
                window = opts.windows[-1]
            except TypeError:
                window = 'None'
            add_path(cur, out, '2D_BED' if opts.read == 0 else 'SAM/MAP',
                     jobid, opts.workdir)
            frag = ('none' if opts.iterative else
                    'fast_frag' if opts.read == 0 else
                    'frag' if i == len(outfiles) - 1 else 'full')
            try:
                cur.execute("""
                insert into MAPPED_INPUTs
                (Id  , PATHid, Entries, Trim, Frag, Read, Enzyme, Dangling_Ends, Ligation_Sites, WRKDIRid, MAPPED_OUTPUTid, INDEXid)
                values
                (NULL,     %d,      %d, '%s', '%s',   %d,   '%s',          '%s',           '%s',       %d,              %d,      %d)
                """ % (get_path_id(cur, opts.fastq, opts.workdir),
                       num, window, frag, opts.read,
                       '-'.join(map(str, opts.renz)),
                       ' '.join('%s:%.3f%%' % (r, dangling_ends.get(r, float('nan')))
                                for r in opts.renz),
                       ' '.join('%s:%.3f%%' % ('-'.join(r), ligated.get(r, float('nan')))
                                for r in ligated),
                       get_path_id(cur, opts.workdir),
                       get_path_id(cur, out, opts.workdir),
                       get_path_id(cur, opts.index, opts.workdir)))
            except lite.IntegrityError:
                pass
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='PARSED_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table MAPPED_OUTPUTs (Id integer primary key, PATHid int, BEDid int, Uniquely_mapped int, unique (PATHid, BEDid))""") cur.execute(""" create table PARSED_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiples text, unique (PATHid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True ) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Parse', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, out_file1, 'BED', jobid, opts.workdir) for genome in opts.genome: add_path(cur, genome, 'FASTA', jobid, opts.workdir) if out_file2: add_path(cur, out_file2, 'BED', jobid, opts.workdir) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for count in counts: try: sum_reads = 0 for i, item in enumerate(counts[count]): cur.execute(""" insert into MAPPED_OUTPUTs (Id , PATHid, BEDid, Uniquely_mapped) values (NULL, %d, %d, %d) """ % (get_path_id(cur, fnames[count][i], opts.workdir), get_path_id(cur, outfiles[count], opts.workdir), counts[count][item])) sum_reads += counts[count][item] except lite.IntegrityError: print 'WARNING: already parsed (MAPPED_OUTPUTs)' try: cur.execute(""" insert into PARSED_OUTPUTs (Id , PATHid, Total_interactions, Multiples) values (NULL, %d, %d, '%s') """ % (get_path_id(cur, outfiles[count], opts.workdir), sum_reads, ','.join([':'.join(map(str, (n, multis[count][n]))) for n in multis[count] if n]))) except lite.IntegrityError: print 'WARNING: already parsed (PARSED_OUTPUTs)' print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
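# Illustrative sketch (not part of the tool): the Multiples column above is a
# text field like '2:1234,3:56' (number of extra fragments -> count of such
# reads). Hypothetical helpers for encoding and decoding it:
def encode_multiples(multis):
    return ','.join('%d:%d' % (n, multis[n]) for n in sorted(multis) if n)

def decode_multiples(text):
    multis = {}
    for token in text.split(','):
        if token:
            n, count = token.split(':')
            multis[int(n)] = int(count)
    return multis

# e.g. encode_multiples({2: 1234, 3: 56}) -> '2:1234,3:56'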
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked, outbam, hist_path, median, max_f, mad, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='INTERSECTION_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table INTERSECTION_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiple_interactions text, Median_fragment_length, MAD_fragment_length, Max_fragment_length, unique (PATHid))""") cur.execute(""" create table FILTER_OUTPUTs (Id integer primary key, PATHid int, Name text, Count int, Applied text, JOBid int, unique (PATHid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Filter', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, mreads, '2D_BED', jobid, opts.workdir) add_path(cur, outbam, 'HIC_BAM', jobid, opts.workdir) add_path(cur, outbam + '.bai', 'HIC_BAI', jobid, opts.workdir) add_path(cur, reads, '2D_BED', jobid, opts.workdir) add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir) try: real_count = count for mult in multiples: real_count = real_count - multiples[mult] + multiples[mult] * ( (mult * (mult + 1)) // 2) cur.execute( """ insert into INTERSECTION_OUTPUTs (Id , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length) values (NULL, %d, %d, '%s', %d, %d, %d) """ % (get_path_id(cur, mreads, opts.workdir), real_count, ' '.join( ['%s:%d' % (k, multiples[k]) for k in sorted(multiples)]), median, mad, max_f)) except lite.IntegrityError: print('WARNING: already filtered') if opts.force: cur.execute( 'delete from INTERSECTION_OUTPUTs where PATHid = %d' % (get_path_id(cur, mreads, opts.workdir))) cur.execute(""" insert into INTERSECTION_OUTPUTs (Id , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length) values (NULL, %d, %d, '%s', %d, %d, %d) """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join( ['%s:%d' % (k, multiples[k]) for k in sorted(multiples)]), median, mad, max_f)) for nf, f in enumerate(masked, 1): try: add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir) except KeyError: continue try: cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, Applied, JOBid) values (NULL, %d, '%s', '%s', '%s', %d) """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir), masked[f]['name'], masked[f]['reads'], 'True' if nf in opts.apply else 'False', jobid)) except lite.IntegrityError: print('WARNING: already filtered') if opts.force: cur.execute( 'delete from FILTER_OUTPUTs where PATHid = %d' % (get_path_id(cur, masked[f]['fnam'], opts.workdir))) cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, Applied, JOBid) 
values (NULL, %d, '%s', '%s', '%s', %d) """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir), masked[f]['name'], masked[f]['reads'], 'True' if nf in opts.apply else 'False', jobid)) try: cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, Applied, JOBid) values (NULL, %d, '%s', '%s', '%s', %d) """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs', n_valid_pairs, '', jobid)) except lite.IntegrityError: print('WARNING: already filtered') if opts.force: cur.execute('delete from FILTER_OUTPUTs where PATHid = %d' % (get_path_id(cur, mreads, opts.workdir))) cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, Applied, JOBid) values (NULL, %d, '%s', '%s', '%s', %d) """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs', n_valid_pairs, '', jobid)) print_db(cur, 'PATHs') if not opts.fast_fragment: print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs') print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'FILTER_OUTPUTs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
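# Illustrative sketch (not part of the tool): the `real_count` correction in
# the filter save_to_db above expands multi-contacts into pairwise contacts:
# a read pair involving `mult` extra fragments contributes
# mult * (mult + 1) / 2 pairwise interactions instead of 1. The same
# arithmetic as a hypothetical helper, with a worked example:
def expand_multiples(count, multiples):
    real_count = count
    for mult in multiples:
        # replace each multi-contact by its number of pairwise combinations
        real_count += multiples[mult] * ((mult * (mult + 1)) // 2 - 1)
    return real_count

# e.g. 1000 total reads of which 100 have 2 extra fragments (3 pairs each)
# and 10 have 3 extra fragments (6 pairs each):
# expand_multiples(1000, {2: 100, 3: 10}) -> 1000 + 100*2 + 10*5 = 1250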
def run(opts): check_options(opts) launch_time = time.localtime() reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2] f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids) renz = renz.split('-') opts.workdir = path.abspath(opts.workdir) name = path.split(opts.workdir)[-1] param_hash = digest_parameters(opts) outdir = '02_parsed_reads' mkdir(path.join(opts.workdir, outdir)) if not opts.read: out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash)) out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash)) elif opts.read == 1: out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash)) out_file2 = None f_names2 = None elif opts.read == 2: out_file2 = None f_names1 = f_names2 f_names2 = None out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash)) logging.info('parsing genomic sequence') try: # allows the use of cPickle genome to make it faster genome = load(open(opts.genome[0])) except UnpicklingError: genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom) if not opts.skip: logging.info('parsing reads in %s project', name) counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1, out_file2=out_file2, re_name=renz, verbose=True, genome_seq=genome, compress=opts.compress_input) else: counts = {} counts[0] = {} fhandler = open(out_file1) for line in fhandler: if line.startswith('# MAPPED '): _, _, item, value = line.split() counts[0][item] = int(value) elif not line.startswith('#'): break multis = {} multis[0] = {} for line in fhandler: if '|||' in line: try: multis[0][line.count('|||')] += 1 except KeyError: multis[0][line.count('|||')] = 1 if out_file2: counts[1] = {} fhandler = open(out_file2) for line in fhandler: if line.startswith('# MAPPED '): _, _, item, value = line.split() counts[1][item] = int(value) elif not line.startswith('#'): break multis[1] = 0 for line in fhandler: if '|||' in line: multis[1] += line.count('|||') # write machine log while path.exists(path.join(opts.workdir, '__lock_log')): time.sleep(0.5) open(path.join(opts.workdir, '__lock_log'), 'a').close() with open(path.join(opts.workdir, 'trace.log'), "a") as mlog: for read in counts: for item in counts[read]: mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % ( read, counts[read][item], out_file1 if read == 1 else out_file2)) # release lock try: remove(path.join(opts.workdir, '__lock_log')) except OSError: pass finish_time = time.localtime() # save all job information to sqlite DB save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2, launch_time, finish_time)
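# Illustrative sketch (not part of the tool): the resume branch above recovers
# counts from '# MAPPED <window> <number>' header lines of a parsed TSV, then
# tallies multiple contacts from the '|||' separators in data lines. The same
# scan as a hypothetical helper (like the code above, the first data line,
# which ends the header scan, is not re-examined):
def scan_parsed_tsv(fname):
    counts, multis = {}, {}
    fhandler = open(fname)
    for line in fhandler:
        if line.startswith('# MAPPED '):
            _, _, item, value = line.split()
            counts[item] = int(value)
        elif not line.startswith('#'):
            break  # first data line reached; headers are over
    for line in fhandler:  # continue from the current position
        if '|||' in line:
            multis[line.count('|||')] = multis.get(line.count('|||'), 0) + 1
    fhandler.close()
    return counts, multis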
def save_to_db(opts, cmp_result, tad_result, reso, inputs,
               launch_time, finish_time):
    if 'tmp' in opts and opts.tmp:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmp
        copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='SEGMENT_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table SEGMENT_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                TADs int,
                Compartments int,
                Chromosome text,
                Resolution int)""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type     , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Segment',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        for crm in max(cmp_result.keys(), tad_result.keys(), key=lambda x: len(x)):
            if crm in cmp_result:
                add_path(cur, cmp_result[crm]['path'], 'COMPARTMENT',
                         jobid, opts.workdir)
            if crm in tad_result:
                add_path(cur, tad_result[crm]['path'], 'TAD', jobid, opts.workdir)
            cur.execute("""
            insert into SEGMENT_OUTPUTs
            (Id  , JOBid, Inputs, TADs, Compartments, Chromosome, Resolution)
            values
            (NULL,    %d,   '%s',   %d,           %d,       '%s',         %d)
            """ % (jobid,
                   ','.join([str(i) for i in inputs]),
                   tad_result[crm]['num'] if crm in tad_result else 0,
                   cmp_result[crm]['num'] if crm in cmp_result else 0,
                   crm, reso))
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'SEGMENT_OUTPUTs')
    if 'tmp' in opts and opts.tmp:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
def save_to_db(opts, cmp_result, tad_result, reso, inputs,
               richA_stats, firsts, param_hash, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='SEGMENT_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table SEGMENT_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                TADs int,
                Compartments int,
                richA_corr real,
                EV_index int,
                EValue real,
                Chromosome text,
                Resolution int)""")
        try:
            parameters = digest_parameters(opts, get_md5=False, extra=['fasta'])
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type     , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Segment',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        for ncrm, crm in enumerate(max(cmp_result.keys(), tad_result.keys(),
                                       key=len)):
            if crm in cmp_result:
                add_path(cur, cmp_result[crm]['path_cmprt1'], 'COMPARTMENT',
                         jobid, opts.workdir)
                add_path(cur, cmp_result[crm]['path_cmprt2'], 'COMPARTMENT',
                         jobid, opts.workdir)
                add_path(cur, cmp_result[crm]['image_cmprt'], 'FIGURE',
                         jobid, opts.workdir)
                if opts.savecorr:
                    add_path(cur, cmp_result[crm]['path_cormat'],
                             'CROSS_CORR_MAT', jobid, opts.workdir)
            if crm in tad_result:
                add_path(cur, tad_result[crm]['path'], 'TAD', jobid, opts.workdir)
            if opts.rich_in_A:
                add_path(cur, opts.rich_in_A, 'BED', jobid, opts.workdir)
            if crm in firsts:
                evalue = firsts[crm][0][(opts.ev_index[ncrm] - 1)
                                        if opts.ev_index else 0]
                eindex = opts.ev_index[ncrm] if opts.ev_index else 1
            else:
                evalue = 'NULL'
                eindex = 'NULL'
            try:
                cur.execute("""
                insert into SEGMENT_OUTPUTs
                (Id  , JOBid, Inputs, TADs, Compartments, richA_corr, EV_index, EValue, Chromosome, Resolution)
                values
                (NULL,    %d,   '%s',   %s,           %s,         %s,       %s,     %s,       '%s',         %d)
                """ % (jobid,
                       ','.join([str(i) for i in inputs]),
                       tad_result[crm]['num'] if crm in tad_result else 'NULL',
                       cmp_result[crm]['num'] if crm in cmp_result else 'NULL',
                       (richA_stats[crm] if crm in richA_stats
                        and richA_stats[crm] is not None else 'NULL'),
                       eindex, evalue, crm, reso))
            except lite.OperationalError:  # TODO: remove this
                print_exc()
                # add the columns missing from an older SEGMENT_OUTPUTs schema
                try:
                    cur.execute(
                        "alter table SEGMENT_OUTPUTs add column 'richA_corr' 'real'")
                except lite.OperationalError:
                    pass
                try:
                    cur.execute(
                        "alter table SEGMENT_OUTPUTs add column 'EValue' 'real'")
                except lite.OperationalError:
                    pass
                try:
                    cur.execute(
                        "alter table SEGMENT_OUTPUTs add column 'EV_index' 'int'")
                except lite.OperationalError:
                    pass
                cur.execute("""
                insert into SEGMENT_OUTPUTs
                (Id  , JOBid, Inputs, TADs, Compartments, richA_corr, EV_index, EValue, Chromosome, Resolution)
                values
                (NULL,    %d,   '%s',   %d,           %d,         %s,       %s,     %s,       '%s',         %d)
                """ % (jobid,
                       ','.join([str(i) for i in inputs]),
                       tad_result[crm]['num'] if crm in tad_result else 0,
                       cmp_result[crm]['num'] if crm in cmp_result else 0,
                       (richA_stats[crm] if crm in richA_stats
                        and richA_stats[crm] is not None else 'NULL'),
                       eindex, evalue, crm, reso))
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'SEGMENT_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
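# Illustrative sketch (not part of the tool): the except-branch above adds
# missing columns only after an insert fails. A hypothetical helper that
# checks the schema up front, using SQLite's PRAGMA table_info:
def ensure_columns(cur, table, columns):
    """columns: dict of column name -> SQL type, added when missing."""
    cur.execute("PRAGMA table_info(%s)" % table)
    present = set(row[1] for row in cur.fetchall())  # row[1] is the name
    for name in columns:
        if name not in present:
            cur.execute("alter table %s add column '%s' %s" % (
                table, name, columns[name]))

# e.g. ensure_columns(cur, 'SEGMENT_OUTPUTs',
#                     {'richA_corr': 'real', 'EV_index': 'int', 'EValue': 'real'})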
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='PARSED_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table MAPPED_OUTPUTs (Id integer primary key, PATHid int, BEDid int, Uniquely_mapped int, unique (PATHid, BEDid))""") cur.execute(""" create table PARSED_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiples text, unique (PATHid))""") cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='JOBs'""") if not cur.fetchall(): cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Parse', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, out_file1, 'BED', jobid, opts.workdir) for genome in opts.genome: add_path(cur, genome, 'FASTA', jobid, opts.workdir) if out_file2: add_path(cur, out_file2, 'BED', jobid, opts.workdir) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for count in counts: try: sum_reads = 0 for i, item in enumerate(counts[count]): add_path(cur, fnames[count][i], 'MAPPED_FASTQ', jobid, opts.workdir) cur.execute(""" insert into MAPPED_OUTPUTs (Id , PATHid, BEDid, Uniquely_mapped) values (NULL, %d, %d, %d) """ % (get_path_id(cur, fnames[count][i], opts.workdir), get_path_id(cur, outfiles[count], opts.workdir), counts[count][item])) sum_reads += counts[count][item] except lite.IntegrityError: print('WARNING: already parsed (MAPPED_OUTPUTs)') try: cur.execute(""" insert into PARSED_OUTPUTs (Id , PATHid, Total_interactions, Multiples) values (NULL, %d, %d, '%s') """ % (get_path_id( cur, outfiles[count], opts.workdir), sum_reads, ','.join([ ':'.join(map(str, (n, multis[count][n]))) for n in multis[count] if n ]))) except lite.IntegrityError: print('WARNING: already parsed (PARSED_OUTPUTs)') print_db(cur, 'PATHs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]  # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print 'loading %s \n at resolution %s' % (mreads, nicer(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nicer(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print '  - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta,
                                                   chr_filter=opts.crms),
                                       reso, chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True,
            suffix=param_hash, rich_in_A=rich_in_A,
            show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs, ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' % (
                    crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num': len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                            'tads_%s' % (nicer(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0
                               for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1) if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix],
                            remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)
            # use normalization to compute height on TADs called
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score',
                                               'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    '\t%s' % (round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path': out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
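# Illustrative sketch (not part of the tool): the `to_rm` tuple above remaps
# genome-wide bad columns to chromosome-local positions: bin i of the focused
# chromosome is flagged when beg + i is a genome-wide bad column.
def bad_column_mask(bads, beg, end):
    """1/0 flags, one per bin of the chromosome spanning [beg, end)."""
    return tuple(1 if i in bads else 0 for i in range(beg, end))

# e.g. with genome-wide bads {5: True, 12: True} and a chromosome occupying
# bins 10..19, only local bin 2 (global bin 12) is flagged:
# bad_column_mask({5: True, 12: True}, 10, 20)
#   -> (0, 0, 1, 0, 0, 0, 0, 0, 0, 0)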
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(reads, nreads=1000000,
                                          stats=('median', 'first_decay', 'MAD'),
                                          savefig=hist_path)

        print '  - median insert size =', median
        print '  - median absolute deviation of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

        max_mole = max_f            # pseudo DEs
        min_dist = max_f + mad      # random breaks
        print ('   Using the maximum continuous fragment size '
               '(%d bp) to check for pseudo-dangling ends') % max_mole
        print ('   Using maximum continuous fragment size plus the MAD '
               '(%d bp) to check for random breaks') % min_dist

        print "Identify pairs to filter..."
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=0.001, max_frag_size=100000,
                              min_frag_size=50, re_proximity=5,
                              min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
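# Illustrative sketch (not part of the tool): the thresholds above derive
# from the fragment-size distribution: max_mole is the largest insert size
# before a > 10 bp gap in observed lengths ('first_decay'), and min_dist adds
# the MAD on top of it. The same arithmetic with invented example values:
def filter_thresholds(max_f, mad):
    max_mole = max_f        # pairs closer than this to a single RE site may
                            # be pseudo-dangling ends
    min_dist = max_f + mad  # pairs farther than this from any RE site are
                            # likely random breaks
    return max_mole, min_dist

# e.g. filter_thresholds(580, 90) -> (580, 670)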
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bam: mreads = path.realpath(opts.bam) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) filter_exclude = opts.filter outdir = path.join(opts.workdir, '04_normalization') mkdir(outdir) mappability = gc_content = n_rsites = None if opts.normalization == 'oneD': if not opts.fasta: raise Exception('ERROR: missing path to FASTA for oneD normalization') if not opts.renz: raise Exception('ERROR: missing restriction enzyme name for oneD normalization') if not opts.mappability: raise Exception('ERROR: missing path to mappability for oneD normalization') bamfile = AlignmentFile(mreads, 'rb') refs = bamfile.references bamfile.close() # get genome sequence ~1 min printime(' - parsing FASTA') genome = parse_fasta(opts.fasta, verbose=False) fas = set(genome.keys()) bam = set(refs) if fas - bam: print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))) if len(fas - bam) <= 50: print('\n'.join([(' - ' + c) for c in (fas - bam)])) if bam - fas: txt = ('\n'.join([(' - ' + c) for c in (bam - fas)]) if len(bam - fas) <= 50 else '') raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % ( len(bam - fas), txt)) refs = [crm for crm in refs if crm in genome] if len(refs) == 0: raise Exception("ERROR: chromosomes in FASTA differ from the ones" " in BAM") # get mappability ~2 min printime(' - Parsing mappability') mappability = parse_mappability_bedGraph( opts.mappability, opts.reso, wanted_chrom=refs[0] if len(refs)==1 else None) # resize chromosomes for c in refs: if c not in mappability: mappability[c] = [float('nan')] * (len(genome[c]) // opts.reso + 1) if len(mappability[c]) < len(genome[c]) // opts.reso + 1: mappability[c] += [float('nan')] * ( (len(genome[c]) // opts.reso + 1) - len(mappability[c])) # concatenate mappability = reduce(lambda x, y: x + y, (mappability.get(c, []) for c in refs)) printime(' - Computing GC content per bin (removing Ns)') gc_content = get_gc_content(genome, opts.reso, chromosomes=refs, n_cpus=opts.cpus) # pad mappability at the end if its size is close to that of gc_content if len(mappability) < len(gc_content) and float(len(mappability)) / len(gc_content) > 0.95: mappability += [float('nan')] * (len(gc_content) - len(mappability)) # compute r_sites ~30 sec # TODO: read from DB printime(' - Computing number of RE sites per bin (+/- 200 bp)') n_rsites = [] re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '') for crm in refs: for pos in range(200, len(genome[crm]) + 200, opts.reso): seq = genome[crm][pos-200:pos + opts.reso + 200] n_rsites.append(seq.count(re_site)) ## CHECK TO BE REMOVED # out = open('tmp_mappability.txt', 'w') # i = 0 # for crm in refs: # for pos in xrange(len(genome[crm]) / opts.reso + 1): # out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i])) # i += 1 # out.close() # compute GC content ~30 sec # TODO: read from DB biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam( mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2, factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus, normalization=opts.normalization, mappability=mappability, p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites, seed=opts.seed, normalize_only=opts.normalize_only, max_njobs=opts.max_njobs, extra_bads=opts.badcols, biases_path=opts.biases_path, cis_limit=opts.cis_limit, trans_limit=opts.trans_limit, min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)
inter_vs_gcoord = path.join(opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.png_%s_%s.png' % ( opts.reso, param_hash)) # get and plot decay if not opts.normalize_only: printime(' - Computing interaction decay vs genomic distance') (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only, savefig=inter_vs_gcoord) print (' -> Decay slope 0.7-10 Mb\t%s' % a2) else: a2 = 0. printime(' - Saving biases and badcol columns') # biases bias_file = path.join(outdir, 'biases_%s_%s.pickle' % ( nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'wb') dump({'biases' : biases, 'decay' : decay, 'badcol' : badcol, 'resolution': opts.reso}, out, HIGHEST_PROTOCOL) out.close() finish_time = time.localtime() try: save_to_db(opts, bias_file, mreads, len(badcol), len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter, launch_time, finish_time) except: # release lock anyway print_exc() try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass exit(1)
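# Illustrative sketch (not part of the pipeline): re-using the biases file
# dumped above. Assumes bin indices as keys, as in the dictionary written by
# read_bam; a raw count between bins i and j is divided by the product of
# their biases, and filtered-out (bad) columns yield NaN.
try:
    from cPickle import load  # Python 2, as in the surrounding code
except ImportError:
    from pickle import load

def normalize_count(bias_path, raw_count, i, j):
    dat = load(open(bias_path, 'rb'))
    if i in dat['badcol'] or j in dat['badcol']:
        return float('nan')
    return raw_count / (dat['biases'][i] * dat['biases'][j])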
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2, launch_time, finish_time): con = lite.connect(path.join(opts.workdir, 'trace.db')) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='PARSED_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table MAPPED_OUTPUTs (Id integer primary key, PATHid int, BEDid int, Uniquely_mapped int, unique (PATHid, BEDid))""") cur.execute(""" create table PARSED_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiples int, unique (PATHid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Parse', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, out_file1, 'BED', jobid, opts.workdir) for genome in opts.genome: add_path(cur, genome, 'FASTA', jobid, opts.workdir) if out_file2: add_path(cur, out_file2, 'BED', jobid, opts.workdir) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for count in counts: try: sum_reads = 0 for i, item in enumerate(counts[count]): cur.execute(""" insert into MAPPED_OUTPUTs (Id , PATHid, BEDid, Uniquely_mapped) values (NULL, %d, %d, %d) """ % (get_path_id(cur, fnames[count][i], opts.workdir), get_path_id(cur, outfiles[count], opts.workdir), counts[count][item])) sum_reads += counts[count][item] except lite.IntegrityError: print 'WARNING: already parsed (MAPPED_OUTPUTs)' try: cur.execute(""" insert into PARSED_OUTPUTs (Id , PATHid, Total_interactions, Multiples) values (NULL, %d, %d, %d) """ % (get_path_id(cur, outfiles[count], opts.workdir), sum_reads, multis[count])) except lite.IntegrityError: print 'WARNING: already parsed (PARSED_OUTPUTs)' print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs')
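# Illustrative sketch: the inserts above interpolate values straight into the
# SQL string, which breaks as soon as a parameter contains a quote. The same
# JOBs insert written with sqlite3 placeholders (hypothetical helper, not in
# the pipeline):
import time

def insert_job(cur, parameters, launch_time, finish_time, job_type, param_hash):
    cur.execute("insert into JOBs (Id, Parameters, Launch_time, Finish_time,"
                " Type, Parameters_md5) values (NULL, ?, ?, ?, ?, ?)",
                (parameters,
                 time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                 time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                 job_type, param_hash))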
def run(opts): check_options(opts) samtools = which(opts.samtools) launch_time = time.localtime() param_hash = digest_parameters(opts) reso1 = reso2 = None if opts.bam1: mreads1 = path.realpath(opts.bam1) biases1 = opts.biases1 else: biases1, mreads1, reso1 = load_parameters_fromdb( opts.workdir1, opts.jobid1, opts, opts.tmpdb1) mreads1 = path.join(opts.workdir1, mreads1) try: biases1 = path.join(opts.workdir1, biases1) except AttributeError: biases1 = None except TypeError: # Py3 biases1 = None if opts.bam2: mreads2 = path.realpath(opts.bam2) biases2 = opts.biases2 else: biases2, mreads2, reso2 = load_parameters_fromdb( opts.workdir2, opts.jobid2, opts, opts.tmpdb2) mreads2 = path.join(opts.workdir2, mreads2) try: biases2 = path.join(opts.workdir2, biases2) except AttributeError: biases2 = None except TypeError: # Py3 biases2 = None filter_exclude = opts.filter if reso1 != reso2: raise Exception('ERROR: differing resolutions between experiments to ' 'be merged') mkdir(path.join(opts.workdir, '00_merge')) if not opts.skip_comparison: printime(' - loading first sample %s' % (mreads1)) hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1, tmpdir=path.join(opts.workdir, '00_merge'), ncpus=opts.cpus, filter_exclude=filter_exclude) printime(' - loading second sample %s' % (mreads2)) hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2, tmpdir=path.join(opts.workdir, '00_merge'), ncpus=opts.cpus, filter_exclude=filter_exclude) if opts.workdir1 and opts.workdir2: masked1 = {'valid-pairs': {'count': 0}} masked2 = {'valid-pairs': {'count': 0}} else: masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}} masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}} decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash)) eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash)) eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash)) printime(' - comparing experiments') printime(' => correlation between equidistant loci') corr, _, scc, std, bads = correlate_matrices( hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, savefig=decay_corr_fig, savedata=decay_corr_dat, get_bads=True) print(' - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)) printime(' => correlation between eigenvectors') eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm, remove_bad_columns=True, nvect=6, savefig=eigen_corr_fig, savedata=eigen_corr_dat) printime(' => reproducibility score') reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm, verbose=False, remove_bad_columns=True) print(' - reproducibility score: %.4f' % (reprod)) ncols = len(hic_data1) else: ncols = 0 decay_corr_dat = 'None' decay_corr_fig = 'None' eigen_corr_dat = 'None' eigen_corr_fig = 'None' masked1 = {} masked2 = {} corr = eig_corr = scc = std = reprod = 0 bads = {} # merge inputs mkdir(path.join(opts.workdir, '03_filtered_reads')) outbam = path.join(opts.workdir, '03_filtered_reads', 'intersection_%s.bam' % (param_hash)) if not opts.skip_merge: outbam = path.join(opts.workdir, '03_filtered_reads', 'intersection_%s.bam' % (param_hash)) printime(' - Merging experiments') system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2)) printime(' - Indexing new BAM file')
# check samtools version number and modify command line version = LooseVersion([l.split()[1] for l in Popen(samtools, stderr=PIPE, universal_newlines=True).communicate()[1].split('\n') if 'Version' in l][0]) if version >= LooseVersion('1.3.1'): system(samtools + ' index -@ %d %s' % (opts.cpus, outbam)) else: system(samtools + ' index %s' % (outbam)) else: outbam = '' finish_time = time.localtime() save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig, len(list(bads.keys())), ncols, scc, std, reprod, eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr, biases1, biases2, masked1, masked2, launch_time, finish_time) printime('\nDone.')
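# Illustrative sketch: the version gate above, isolated. samtools prints its
# version on stderr when called without arguments, and threaded indexing
# ('index -@') is only available from samtools 1.3.1 onwards.
from distutils.version import LooseVersion
from subprocess import Popen, PIPE

def samtools_index_cmd(samtools, outbam, cpus):
    err = Popen(samtools, stderr=PIPE,
                universal_newlines=True).communicate()[1]
    version = LooseVersion([l.split()[1] for l in err.split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        return '%s index -@ %d %s' % (samtools, cpus, outbam)
    return '%s index %s' % (samtools, outbam)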
def save_to_db(opts, launch_time, finish_time, out_files, out_plots): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() try: parameters = digest_parameters(opts, get_md5=False, extra=['quiet']) param_hash = digest_parameters(opts, get_md5=True , extra=['quiet']) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Bin', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass except lite.OperationalError: try: cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") except lite.OperationalError: pass # may happen when mapped files were cleaned cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Bin', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) jobid = get_jobid(cur) for fnam in out_files: add_path(cur, out_files[fnam], fnam + '_MATRIX', jobid, opts.workdir) for fnam in out_plots: add_path(cur, out_plots[fnam], fnam + '_FIGURE', jobid, opts.workdir) if not opts.quiet: print_db(cur, 'JOBs') print_db(cur, 'PATHs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
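# Illustrative sketch (not in the pipeline): the __lock_db spin-wait used in
# the save_to_db functions, wrapped as a context manager so the lock is also
# released when the body raises.
import time
from contextlib import contextmanager
from os import path, remove

@contextmanager
def db_lock(workdir):
    lock = path.join(workdir, '__lock_db')
    while path.exists(lock):   # wait for a concurrent job to finish
        time.sleep(0.5)
    open(lock, 'a').close()    # take the lock
    try:
        yield
    finally:
        try:
            remove(lock)       # release, even on error paths
        except OSError:
            pass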
def save_to_db(opts, bias_file, mreads, bad_col_image, nbad_columns, ncolumns, raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, bam_filter, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='JOBs'""") if not cur.fetchall(): cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='NORMALIZE_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table NORMALIZE_OUTPUTs (Id integer primary key, JOBid int, Input int, N_columns int, N_filtered int, BAM_filter int, Cis_percentage_Raw real, Cis_percentage_Norm real, Slope_700kb_10Mb real, Resolution int, Normalization text, Factor int, unique (JOBid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute( """ insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Normalize', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, bias_file, 'BIASES', jobid, opts.workdir) add_path(cur, bad_col_image, 'FIGURE', jobid, opts.workdir) add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir) if opts.bam: add_path(cur, path.realpath(opts.bam), 'EXT_2D_BAM', jobid, opts.workdir) if opts.mappability: add_path(cur, path.realpath(opts.mappability), 'EXT_MAPPABILITY', jobid, opts.workdir) if opts.fasta: add_path(cur, path.realpath(opts.fasta), 'EXT_FASTA', jobid, opts.workdir) # get pathid of input cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir))) input_bed = cur.fetchall()[0][0] a2 = 0 if isnan(a2) else a2 try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, BAM_filter, Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb, Resolution, Normalization, Factor) values (NULL, %d, %d, %d, %d, %d, %f, %f, %f, %d, '%s', %f) """ % (jobid, input_bed, ncolumns, nbad_columns, bam_filter, 100 * raw_cisprc, 100 * norm_cisprc, a2, opts.reso, opts.normalization, opts.factor)) except lite.OperationalError: try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, BAM_filter, Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb, Resolution, Normalization, Factor) values (NULL, %d, %d, %d, %d, %d, %f, %f, %f, %d, '%s', %f) """ % (jobid, input_bed, ncolumns, nbad_columns, bam_filter, 100 * raw_cisprc, 100 * norm_cisprc, a2, opts.reso, opts.normalization, opts.factor)) except lite.OperationalError: print 'WANRING: Normalized table not written!!!' 
print_db(cur, 'PATHs') print_db(cur, 'JOBs') try: print_db(cur, 'FILTER_OUTPUTs') print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') except lite.OperationalError: pass print_db(cur, 'NORMALIZE_OUTPUTs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig, nbad_columns, ncolumns, scc, std, reprod, eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr, biases1, biases2, masked1, masked2, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='MERGE_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") cur.execute(""" create table FILTER_OUTPUTs (Id integer primary key, PATHid int, Name text, Count int, JOBid int, unique (PATHid))""") cur.execute(""" create table MERGE_OUTPUTs (Id integer primary key, JOBid int, Wrkd1Path int, Wrkd2Path int, Bed1Path int, Bed2Path int, MergePath int, unique (JOBid))""") cur.execute(""" create table MERGE_STATs (Id integer primary key, JOBid int, Inputs text, decay_corr text, eigen_corr text, reprod real, scc real, std_scc real, N_columns int, N_filtered int, Resolution int, bias1Path int, bias2Path int, unique (JOBid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True ) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Merge', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, decay_corr_dat, 'CORR' , jobid, opts.workdir) add_path(cur, decay_corr_fig, 'FIGURE' , jobid, opts.workdir) add_path(cur, eigen_corr_dat, 'CORR' , jobid, opts.workdir) add_path(cur, eigen_corr_fig, 'FIGURE' , jobid, opts.workdir) add_path(cur, opts.workdir , 'WORKDIR' , jobid) add_path(cur, opts.workdir1, 'WORKDIR1' , jobid, opts.workdir) add_path(cur, opts.workdir2, 'WORKDIR2' , jobid, opts.workdir) add_path(cur, mreads1 , 'EXT_HIC_BAM', jobid, opts.workdir) add_path(cur, mreads2 , 'EXT_HIC_BAM', jobid, opts.workdir) if not opts.skip_merge: add_path(cur, outbed , 'HIC_BAM' , jobid, opts.workdir) if opts.norm: add_path(cur, biases1 , 'BIASES' , jobid, opts.workdir) add_path(cur, biases2 , 'BIASES' , jobid, opts.workdir) biasid1 = get_path_id(cur, biases1, opts.workdir) biasid2 = get_path_id(cur, biases2, opts.workdir) else: biasid1 = 0 biasid2 = 0 cur.execute("select id from paths where path = '%s'" % ( path.relpath(mreads1, opts.workdir))) bed1 = cur.fetchall()[0][0] if opts.workdir1: cur.execute("select id from paths where path = '%s'" % ( path.relpath(opts.workdir1, opts.workdir))) w1path = cur.fetchall()[0][0] else: w1path = 0 cur.execute("select id from paths where path = '%s'" % ( path.relpath(mreads2, opts.workdir))) bed2 = cur.fetchall()[0][0] if opts.workdir2: cur.execute("select id from paths where path = '%s'" % ( path.relpath(opts.workdir2, opts.workdir))) w2path = cur.fetchall()[0][0] else: w2path = 0 if not opts.skip_merge: cur.execute("select id from paths where path = '%s'" % ( path.relpath(outbed, opts.workdir))) outbedid = cur.fetchall()[0][0] if not opts.skip_comparison: decay_corr = '-'.join(['%.1f' % (v) for v in corr[:10:2]]).replace('0.', '.') eigen_corr = '-'.join(['%.2f' % (max(v)) for v in eig_corr[:4]]).replace('0.', '.') else: decay_corr = eigen_corr = None if not opts.skip_merge: cur.execute(""" insert into MERGE_OUTPUTs (Id , JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath) values (NULL, %d, %d, %d, %d, %d, %d) """ % (jobid, w1path, w2path, bed1, bed2, outbedid)) if not opts.skip_comparison: cur.execute(""" insert into MERGE_STATs (Id , JOBid, N_columns, N_filtered, Resolution, reprod, scc, std_scc, decay_corr, eigen_corr, bias1Path, bias2Path) values (NULL, %d, %d, %d, %d, %f, %f, %f, '%s', '%s', %d, %d) """ % (jobid, ncolumns, nbad_columns, opts.reso , reprod, scc, std, decay_corr, eigen_corr, biasid1, biasid2)) if opts.workdir1: if 'tmpdb' in opts and opts.tmpdb: # tmp file dbfile1 = opts.tmpdb1 try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1) except IOError: pass else: dbfile1 = path.join(opts.workdir1, 'trace.db') tmpcon = lite.connect(dbfile1) with tmpcon: tmpcur = tmpcon.cursor() tmpcur.execute("select Name, PATHid, Count from filter_outputs") for name, pathid, count in tmpcur.fetchall(): res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid)) tmppath = res.fetchall()[0][0] masked1[name] = {'path': tmppath, 'count': count} if 'tmpdb' in opts and opts.tmpdb: remove(dbfile1) if opts.workdir2: if 'tmpdb' in opts and opts.tmpdb: # tmp file dbfile2 = opts.tmpdb2 try: # to copy in case read2 was already mapped for example copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2) except IOError: pass else: dbfile2 = path.join(opts.workdir2, 'trace.db') tmpcon = lite.connect(dbfile2) with tmpcon: tmpcur = tmpcon.cursor() tmpcur.execute("select Name, PATHid, Count from filter_outputs") for name, pathid, count in tmpcur.fetchall(): res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid)) tmppath = res.fetchall()[0][0] masked2[name] = {'path': tmppath, 'count': count} if 'tmpdb' in opts and opts.tmpdb: remove(dbfile2) for f in masked1: if f != 'valid-pairs': outmask = path.join(opts.workdir, '03_filtered_reads', 'all_r1-r2_intersection_%s.tsv_%s.tsv' % ( param_hash, f)) out = open(outmask, 'w') try: fh = magic_open(path.join(opts.workdir1, masked1[f]['path'])) except FileNotFoundError: fh = magic_open(path.join(opts.workdir1, masked1[f]['path'] + '.gz')) for line in fh: out.write(line) try: fh = magic_open(path.join(opts.workdir2, masked2[f]['path'])) except FileNotFoundError: fh = magic_open(path.join(opts.workdir2, masked2[f]['path'] + '.gz')) for line in fh: out.write(line) add_path(cur, outmask, 'FILTER', jobid, opts.workdir) else: if opts.skip_merge: outmask = 'NA' else: outmask = outbed try: path_id = get_path_id(cur, outmask, opts.workdir) except IndexError: path_id = -1 cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, JOBid) values (NULL, %d, '%s', '%s', %d) """ % (path_id, f, masked1[f]['count'] + masked2[f]['count'], jobid)) print_db(cur, 'PATHs') print_db(cur, 'JOBs') print_db(cur, 'MERGE_OUTPUTs') print_db(cur, 'MERGE_STATs') print_db(cur, 'FILTER_OUTPUTs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts, extra=['quiet']) coord1 = opts.coord1 if not coord1: region1 = None start1 = None end1 = None else: try: crm1, pos1 = coord1.split(':') start1, end1 = pos1.split('-') region1 = crm1 start1 = int(start1) end1 = int(end1) except ValueError: region1 = coord1 start1 = None end1 = None printime('Importing hic in %s format' % opts.format) if opts.format == 'matrix' or opts.format == 'text': with gzopen(opts.input) as f_thing: masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing) if not chroms_gen or (region1 and region1 not in chroms_gen): raise Exception( '''ERROR: Chromosome size not included in import file. Please include the chromosome sizes of the data that you want to import in the header of the file. Example: # CRM chr1 249250621''') elif opts.format == 'cooler': if is_cooler(opts.input, opts.reso if opts.reso > 1 else None): chroms_gen = parse_header(opts.input, opts.reso if opts.reso > 1 else None) if not chroms_gen or (region1 and region1 not in chroms_gen): raise Exception( '''ERROR: Chromosome size not included in import file. ''') else: raise Exception('''ERROR: The input file is not a cooler''') chroms = OrderedDict( (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen) sections = [] if not region1: size = 0 for crm in chroms: size += chroms[crm] sections.extend([(crm, i) for i in range(chroms[crm])]) elif not start1: size = chroms[region1] sections.extend([(region1, i) for i in range(size)]) else: #size = (end1 - start1)//opts.reso size = chroms[region1] sections.extend([ (region1, i) for i in range(start1 // opts.reso, (end1 // opts.reso)) ]) dict_sec = dict([(j, i) for i, j in enumerate(sections)]) bias_file = None badcol = {} if opts.format == 'text': with gzopen(opts.input) as f_thing: matrix = abc_reader(f_thing, size, start1 // opts.reso if start1 else None) size_mat = size elif opts.format == 'matrix': with gzopen(opts.input) as in_f: matrix, size_mat, _, masked, _ = autoreader(in_f) if size != size_mat: raise Exception('''ERROR: The size of the specified region is different from the data in the matrix''') elif opts.format == 'cooler': matrix, weights, size, header = parse_cooler( opts.input, opts.reso if opts.reso > 1 else None, normalized=True, raw_values=True) masked = {} size_mat = size if len(set(weights)) > 1: printime('Transforming cooler weights to biases') outdir_norm = path.join(opts.workdir, '04_normalization') mkdir(outdir_norm) bias_file = path.join( outdir_norm, 'biases_%s_%s.pickle' % (nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'wb') badcol.update((i, True) for i, m in enumerate(weights) if m == 0) dump( { 'biases': dict((k, b if b > 0 else float('nan')) for k, b in enumerate(weights)), 'decay': {}, 'badcol': badcol, 'resolution': opts.reso }, out, HIGHEST_PROTOCOL) out.close() hic = HiC_data(matrix, size_mat, dict_sec=dict_sec, chromosomes=chroms, masked=masked, resolution=opts.reso) #from pytadbit.mapping.analyze import hic_map #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis') printime('Creating BAM file') outbam = path.join(opts.workdir, '03_filtered_reads', 'intersection_%s' % param_hash) total_counts = create_BAMhic(hic, opts.cpus, outbam, chroms_gen, opts.reso, samtools=opts.samtools) finish_time = time.localtime() # save all job information to sqlite DB save_to_db(opts, total_counts, size_mat, bias_file, len(badcol), outbam + '.bam', launch_time, finish_time)
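# Illustrative sketch: the cooler weight-to-bias conversion used above,
# isolated. Zero weights mark filtered-out bins (badcol); the remaining
# weights become per-bin biases, with non-positive values mapped to NaN.
def weights_to_biases(weights):
    badcol = dict((i, True) for i, m in enumerate(weights) if m == 0)
    biases = dict((k, b if b > 0 else float('nan'))
                  for k, b in enumerate(weights))
    return biases, badcol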
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads, nbad_columns, ncolumns, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig, inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt, intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig, inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt, pickle_path, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='NORMALIZE_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table NORMALIZE_OUTPUTs (Id integer primary key, JOBid int, Input int, N_columns int, N_filtered int, CisTrans_nrm_all real, CisTrans_nrm_out real, CisTrans_raw_all real, CisTrans_raw_out real, Slope_700kb_10Mb real, Resolution int, Factor int, unique (JOBid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute( """ insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Normalize', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, pickle_path, 'PICKLE', jobid, opts.workdir) add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir) add_path(cur, bias_file, 'BIASES', jobid, opts.workdir) add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir) add_path(cur, mreads, '2D_BED', jobid, opts.workdir) # get pathid of input cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir))) input_bed = cur.fetchall()[0][0] if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if genom_map_nrm_fig: add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir) if genom_map_nrm_txt: add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if genom_map_raw_fig: add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir) if genom_map_raw_txt: add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir) try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, CisTrans_nrm_all, CisTrans_nrm_out, CisTrans_raw_all, CisTrans_raw_out, Slope_700kb_10Mb, Resolution, Factor) values (NULL, %d, %d, %d, %d, %f, %f, %f, %f, %f, %d, %f) """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, opts.reso, opts.factor)) except lite.OperationalError: try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, CisTrans_raw_all, CisTrans_raw_out, Slope_700kb_10Mb, Resolution, Factor) values (NULL, %d, %d, %d, %d, %f, %f, %f, %d, %f) """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_n_D, cis_trans_n_d, a2, opts.reso, opts.factor)) except lite.OperationalError: print 'WARNING: Normalized table not written!!!' print_db(cur, 'PATHs') print_db(cur, 'JOBs') try: print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') except lite.OperationalError: pass print_db(cur, 'FILTER_OUTPUTs') print_db(cur, 'NORMALIZE_OUTPUTs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass
def save_to_db(opts, outfiles, launch_time, finish_time): # write little DB to keep track of processes and options con = lite.connect(path.join(opts.workdir, 'trace.db')) with con: # check if table exists cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='MAPPED_INPUTs'""") if not cur.fetchall(): cur.execute(""" create table PATHs (Id integer primary key, JOBid int, Path text, Type text, unique (Path))""") cur.execute(""" create table JOBs (Id integer primary key, Parameters text, Launch_time text, Finish_time text, Type text, Parameters_md5 text, unique (Parameters_md5))""") cur.execute(""" create table MAPPED_INPUTs (Id integer primary key, PATHid int, Entries int, Trim text, Frag text, Read int, Enzyme text, WRKDIRid int, MAPPED_OUTPUTid int, INDEXid int, unique (PATHid,Entries,Read,Enzyme,WRKDIRid,MAPPED_OUTPUTid,INDEXid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Map', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, opts.workdir, 'WORKDIR', jobid) add_path(cur, opts.fastq , 'MAPPED_FASTQ' , jobid, opts.workdir) add_path(cur, opts.index , 'INDEX' , jobid, opts.workdir) for i, (out, num) in enumerate(outfiles): try: window = opts.windows[i] except IndexError: window = opts.windows[-1] except TypeError: window = 'None' add_path(cur, out, 'SAM/MAP', jobid, opts.workdir) frag = ('none' if opts.iterative else 'frag' if i==len(outfiles) - 1 else 'full') try: cur.execute(""" insert into MAPPED_INPUTs (Id , PATHid, Entries, Trim, Frag, Read, Enzyme, WRKDIRid, MAPPED_OUTPUTid, INDEXid) values (NULL, %d, %d, '%s', '%s', %d, '%s', %d, %d, %d) """ % (get_path_id(cur, opts.fastq, opts.workdir), num, window, frag, opts.read, opts.renz, get_path_id(cur, opts.workdir), get_path_id(cur, out, opts.workdir), get_path_id(cur, opts.index, opts.workdir))) except lite.IntegrityError: pass print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs' ) print_db(cur, 'JOBs' )
def run(opts): check_options(opts) launch_time = time.localtime() param_hash = digest_parameters(opts) if opts.bam: mreads = path.realpath(opts.bam) else: mreads = path.join(opts.workdir, load_parameters_fromdb(opts)) filter_exclude = opts.filter outdir = path.join(opts.workdir, '04_normalization') mkdir(outdir) mappability = gc_content = n_rsites = None if opts.normalization == 'oneD': if not opts.fasta: raise Exception( 'ERROR: missing path to FASTA for oneD normalization') if not opts.renz: raise Exception( 'ERROR: missing restriction enzyme name for oneD normalization' ) if not opts.mappability: raise Exception( 'ERROR: missing path to mappability for oneD normalization') bamfile = AlignmentFile(mreads, 'rb') refs = bamfile.references bamfile.close() # get genome sequence ~1 min printime(' - parsing FASTA') genome = parse_fasta(opts.fasta, verbose=False) fas = set(genome.keys()) bam = set(refs) if fas - bam: print 'WARNING: %d extra chromosomes in FASTA (removing them)' % ( len(fas - bam)) if len(fas - bam) <= 50: print '\n'.join([(' - ' + c) for c in (fas - bam)]) if bam - fas: txt = ('\n'.join([(' - ' + c) for c in (bam - fas)]) if len(bam - fas) <= 50 else '') raise Exception( 'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (len(bam - fas), txt)) refs = [crm for crm in refs if crm in genome] if len(refs) == 0: raise Exception( "ERROR: chromosomes in FASTA differ from the ones in BAM") # get mappability ~2 min printime(' - Parsing mappability') fh = open(opts.mappability) mappability = dict((c, []) for c in refs) line = fh.next() crmM, begM, endM, val = line.split() crm = crmM if crmM not in mappability: print(' skipping %s' % crmM) while crmM not in mappability: line = fh.next() crmM, begM, endM, val = line.split() crm = crmM while any(not mappability[c] for c in mappability): for begB in xrange(0, len(genome[crmM]), opts.reso): endB = begB + opts.reso tmp = 0 try: while True: crmM, begM, endM, val = line.split() if crm != crmM: try: while crmM not in refs: line = fh.next() crmM, _ = line.split('\t', 1) except StopIteration: pass break begM = int(begM) endM = int(endM) if endM > endB: weight = endB - begM if weight >= 0: tmp += weight * float(val) break weight = endM - (begM if begM > begB else begB) if weight < 0: break tmp += weight * float(val) line = fh.next() except StopIteration: pass mappability[crm].append(tmp / opts.reso) crm = crmM mappability = reduce(lambda x, y: x + y, (mappability[c] for c in refs)) printime(' - Computing GC content per bin (removing Ns)') gc_content = get_gc_content(genome, opts.reso, chromosomes=refs, n_cpus=opts.cpus) # compute r_sites ~30 sec # TODO: read from DB printime(' - Computing number of RE sites per bin (+/- 200 bp)') n_rsites = [] re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '') for crm in refs: for pos in xrange(200, len(genome[crm]) + 200, opts.reso): seq = genome[crm][pos - 200:pos + opts.reso + 200] n_rsites.append(seq.count(re_site)) ## CHECK TO BE REMOVED # out = open('tmp_mappability.txt', 'w') # i = 0 # for crm in refs: # for pos in xrange(len(genome[crm]) / opts.reso + 1): # out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i])) # i += 1 # out.close() # compute GC content ~30 sec # TODO: read from DB biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam( mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2, factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus, normalization=opts.normalization, mappability=mappability, cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc, max_perc=opts.max_perc, normalize_only=opts.normalize_only, max_njobs=opts.max_njobs, extra_bads=opts.badcols) bad_col_image = path.join( outdir, 'filtered_bins_%s_%s.png' % (nicer(opts.reso).replace(' ', ''), param_hash)) inter_vs_gcoord = path.join( opts.workdir, '04_normalization', 'interactions_vs_genomic-coords.png_%s_%s.png' % (opts.reso, param_hash)) # get and plot decay if not opts.normalize_only: printime(' - Computing interaction decay vs genomic distance') (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions( decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only, savefig=inter_vs_gcoord) print(' -> Decay slope 0.7-10 Mb\t%s' % a2) else: a2 = 0. printime(' - Saving biases and badcol columns') # biases bias_file = path.join( outdir, 'biases_%s_%s.pickle' % (nicer(opts.reso).replace(' ', ''), param_hash)) out = open(bias_file, 'w') dump( { 'biases': biases, 'decay': decay, 'badcol': badcol, 'resolution': opts.reso }, out) out.close() finish_time = time.localtime() try: save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol), len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter, launch_time, finish_time) except: # release lock anyway print_exc() try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass exit(1)
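# Illustrative sketch: the restriction-site count per bin computed above,
# isolated. Each bin is extended by 200 bp on both sides before counting the
# occurrences of the pipe-stripped recognition sequence (Python 2, matching
# the xrange used in the surrounding code).
def count_re_sites(genome, refs, re_site, reso, margin=200):
    n_rsites = []
    for crm in refs:
        for pos in xrange(margin, len(genome[crm]) + margin, reso):
            seq = genome[crm][pos - margin:pos + reso + margin]
            n_rsites.append(seq.count(re_site))
    return n_rsites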
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked, hist_path, median, max_f, mad, launch_time, finish_time): con = lite.connect(path.join(opts.workdir, 'trace.db')) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='INTERSECTION_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table INTERSECTION_OUTPUTs (Id integer primary key, PATHid int, Total_interactions int, Multiple_interactions text, Median_fragment_length, MAD_fragment_length, Max_fragment_length, unique (PATHid))""") cur.execute(""" create table FILTER_OUTPUTs (Id integer primary key, PATHid int, Name text, Count int, JOBid int, unique (PATHid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True ) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type, Parameters_md5) values (NULL, '%s', '%s', '%s', 'Filter', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, mreads, '2D_BED', jobid, opts.workdir) add_path(cur, reads, '2D_BED', jobid, opts.workdir) add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir) try: cur.execute(""" insert into INTERSECTION_OUTPUTs (Id , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length) values (NULL, %d, %d, '%s', %d, %d, %d) """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join(['%s:%d' % (k, multiples[k]) for k in sorted(multiples)]), median, mad, max_f)) except lite.IntegrityError: print 'WARNING: already filtered' if opts.force: cur.execute( 'delete from INTERSECTION_OUTPUTs where PATHid = %d' % ( get_path_id(cur, mreads, opts.workdir))) cur.execute(""" insert into INTERSECTION_OUTPUTs (Id , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length) values (NULL, %d, %d, '%s', %d, %d, %d) """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join(['%s:%d' % (k, multiples[k]) for k in sorted(multiples)]), median, mad, max_f)) for f in masked: add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir) try: cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, JOBid) values (NULL, %d, '%s', '%s', %d) """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir), masked[f]['name'], masked[f]['reads'], jobid)) except lite.IntegrityError: print 'WARNING: already filtered' if opts.force: cur.execute( 'delete from FILTER_OUTPUTs where PATHid = %d' % ( get_path_id(cur, masked[f]['fnam'], opts.workdir))) cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, JOBid) values (NULL, %d, '%s', '%s', %d) """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir), masked[f]['name'], masked[f]['reads'], jobid)) try: cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, JOBid) values (NULL, %d, '%s', '%s', %d) """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs', n_valid_pairs, jobid)) except lite.IntegrityError: print 'WARNING: already filtered' if opts.force: cur.execute( 'delete from FILTER_OUTPUTs where PATHid = %d' % ( get_path_id(cur, mreads, opts.workdir))) cur.execute(""" insert into FILTER_OUTPUTs (Id , PATHid, Name, Count, JOBid) values (NULL, %d, '%s', '%s', %d) """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs', n_valid_pairs, jobid)) print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs') 
print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs') print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'FILTER_OUTPUTs')
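# Illustrative sketch: the delete-then-reinsert dance above (under opts.force)
# can be collapsed into a single statement, since FILTER_OUTPUTs declares
# unique (PATHid); SQLite's 'insert or replace' overwrites the conflicting row.
def upsert_filter_output(cur, pathid, name, count, jobid):
    cur.execute("insert or replace into FILTER_OUTPUTs"
                " (Id, PATHid, Name, Count, JOBid)"
                " values (NULL, ?, ?, ?, ?)",
                (pathid, name, count, jobid))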
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig, inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt, intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig, inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt, launch_time, finish_time): con = lite.connect(path.join(opts.workdir, 'trace.db')) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='NORMALIZE_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table NORMALIZE_OUTPUTs (Id integer primary key, JOBid int, Input int, CisTrans_nrm_all real, CisTrans_nrm_out real, CisTrans_raw_all real, CisTrans_raw_out real, Slope_700kb_10Mb real, Resolution int, Factor int, unique (JOBid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True) cur.execute( """ insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Normalize', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir) add_path(cur, bias_file, 'BIASES', jobid, opts.workdir) add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir) add_path(cur, mreads, '2D_BED', jobid, opts.workdir) # get pathid of input cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir))) input_bed = cur.fetchall()[0][0] if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if genom_map_nrm_fig: add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir) if genom_map_nrm_txt: add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if genom_map_raw_fig: add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir) if genom_map_raw_txt: add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir) cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, CisTrans_nrm_all, CisTrans_nrm_out, CisTrans_raw_all, CisTrans_raw_out, Slope_700kb_10Mb, Resolution, Factor) values (NULL, %d, %d, %f, %f, %f, %f, %f, %d, %f) """ % (jobid, input_bed, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, opts.reso, opts.factor)) print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'PATHs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') print_db(cur, 'JOBs') print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'FILTER_OUTPUTs') print_db(cur, 'NORMALIZE_OUTPUTs')
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads, nbad_columns, ncolumns, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig, inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt, intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig, inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt, pickle_path, launch_time, finish_time): if 'tmpdb' in opts and opts.tmpdb: # check lock while path.exists(path.join(opts.workdir, '__lock_db')): time.sleep(0.5) # close lock open(path.join(opts.workdir, '__lock_db'), 'a').close() # tmp file dbfile = opts.tmpdb try: # to copy in case read1 was already mapped for example copyfile(path.join(opts.workdir, 'trace.db'), dbfile) except IOError: pass else: dbfile = path.join(opts.workdir, 'trace.db') con = lite.connect(dbfile) with con: cur = con.cursor() cur.execute("""SELECT name FROM sqlite_master WHERE type='table' AND name='NORMALIZE_OUTPUTs'""") if not cur.fetchall(): cur.execute(""" create table NORMALIZE_OUTPUTs (Id integer primary key, JOBid int, Input int, N_columns int, N_filtered int, CisTrans_nrm_all real, CisTrans_nrm_out real, CisTrans_raw_all real, CisTrans_raw_out real, Slope_700kb_10Mb real, Resolution int, Factor int, unique (JOBid))""") try: parameters = digest_parameters(opts, get_md5=False) param_hash = digest_parameters(opts, get_md5=True ) cur.execute(""" insert into JOBs (Id , Parameters, Launch_time, Finish_time, Type , Parameters_md5) values (NULL, '%s', '%s', '%s', 'Normalize', '%s') """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time), time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash)) except lite.IntegrityError: pass jobid = get_jobid(cur) add_path(cur, pickle_path , 'PICKLE' , jobid, opts.workdir) add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir) add_path(cur, bias_file , 'BIASES' , jobid, opts.workdir) add_path(cur, inter_vs_gcoord , 'FIGURE' , jobid, opts.workdir) add_path(cur, mreads , '2D_BED' , jobid, opts.workdir) # get pathid of input cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir))) input_bed = cur.fetchall()[0][0] if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_nrm_fig: add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_nrm_fig: add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir) if genom_map_nrm_fig: add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir) if genom_map_nrm_txt: add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if intra_dir_raw_fig: add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir) if inter_dir_raw_fig: add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir) if genom_map_raw_fig: add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir) if genom_map_raw_txt: add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir) try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, CisTrans_nrm_all, CisTrans_nrm_out, CisTrans_raw_all, CisTrans_raw_out, Slope_700kb_10Mb, Resolution, Factor) values (NULL, %d, %d, %d, %d, %f, %f, %f, %f, %f, %d, %f) """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, opts.reso, opts.factor)) except lite.OperationalError: try: cur.execute(""" insert into NORMALIZE_OUTPUTs (Id , JOBid, Input, N_columns, N_filtered, CisTrans_raw_all, CisTrans_raw_out, Slope_700kb_10Mb, Resolution, Factor) values (NULL, %d, %d, %d, %d, %f, %f, %f, %d, %f) """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_n_D, cis_trans_n_d, a2, opts.reso, opts.factor)) except lite.OperationalError: print 'WARNING: Normalized table not written!!!' print_db(cur, 'PATHs') print_db(cur, 'JOBs') try: print_db(cur, 'INTERSECTION_OUTPUTs') print_db(cur, 'MAPPED_INPUTs') print_db(cur, 'MAPPED_OUTPUTs') print_db(cur, 'PARSED_OUTPUTs') except lite.OperationalError: pass print_db(cur, 'FILTER_OUTPUTs') print_db(cur, 'NORMALIZE_OUTPUTs') if 'tmpdb' in opts and opts.tmpdb: # copy back file copyfile(dbfile, path.join(opts.workdir, 'trace.db')) remove(dbfile) # release lock try: remove(path.join(opts.workdir, '__lock_db')) except OSError: pass