def test_19_matrix_manip(self):
    """
    Save a Hi-C matrix, read it back, and check the matrix comparisons.

    The test is skipped when ONLY is set and does not contain "19".  It
    loads Hi-C data from the "lala-map~" reads file, writes it with
    hic_map() (once without by_chrom, then with by_chrom "intra" and
    "inter"), reloads the saved matrix and expects it to equal the one in
    memory.  It then checks insert_sizes() on the reads file and the
    rounded output of correlate_matrices() and eig_correlate_matrices() on
    the chrT A/B example matrices.  Every "lala*" file is removed at the end.
    """
    if ONLY and "19" not in ONLY:
        return
    if CHKTIME:
        started = time()

    from_reads = load_hic_data_from_reads("lala-map~", resolution=10000)
    hic_map(from_reads, savedata="lala-map.tsv~", savefig="lala.pdf")
    for scope, figure in (("intra", "lalalo~"), ("inter", "lalala~")):
        hic_map(from_reads, by_chrom=scope, savedata="lala-maps~",
                savefig=figure)

    # slowest part of the all test:
    from_file = read_matrix("lala-map.tsv~", resolution=10000)
    self.assertEqual(from_reads, from_file)

    # NOTE: the plot_distance_vs_interactions() check is disabled in this
    # version of the test.

    size_a, size_b = insert_sizes("lala-map~")
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv", resolution=20000)
    matrix_b = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv", resolution=20000)

    # only the first element returned by correlate_matrices() is compared
    corr = correlate_matrices(matrix_a, matrix_b)
    rounded_corr = [round(value, 3) for value in corr[0]]
    self.assertEqual(rounded_corr, [
        0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
        0.832])

    # the nested result is flattened by concatenation before rounding
    ecorr = eig_correlate_matrices(matrix_a, matrix_b, savefig='lala3.pdf')
    flat_ecorr = reduce(lambda x, y: x + y, ecorr)
    self.assertEqual([round(value, 3) for value in flat_ecorr], [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89])

    system("rm -rf lala*")
    if CHKTIME:
        self.assertEqual(True, True)
        # one pre-joined argument prints the same under Python 2 and 3
        print(' '.join(("19", str(time() - started))))
def test_19_matrix_manip(self):
    """
    Save a Hi-C matrix, read it back, and check the matrix helpers.

    The test is skipped when ONLY is set and differs from '19'.  It loads
    Hi-C data from the 'lala-map~' reads file, writes it with hic_map()
    (once without by_chrom, then with by_chrom 'intra' and 'inter'),
    reloads the saved matrix and expects it to equal the one in memory.
    It then checks the rounded output of plot_distance_vs_interactions(),
    insert_sizes(), correlate_matrices() and eig_correlate_matrices().
    Every 'lala*' file is removed at the end.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        started = time()

    from_reads = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(from_reads, savedata='lala-map.tsv~', savefig='lala.pdf~')
    for scope, figure in (('intra', 'lalalo~'), ('inter', 'lalala~')):
        hic_map(from_reads, by_chrom=scope, savedata='lala-maps~',
                savefig=figure)

    # slowest part of the all test:
    from_file = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(from_reads, from_file)

    # flatten the nested result by concatenation; 'nan' entries count as 0.0
    vals = plot_distance_vs_interactions(from_reads)
    flat_vals = reduce(lambda x, y: x + y, vals)
    self.assertEqual(
        [round(value, 2) if str(value) != 'nan' else 0.0
         for value in flat_vals],
        [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

    size_a, size_b = insert_sizes('lala-map~')
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    matrix_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    # only the first element returned by correlate_matrices() is compared
    corr = correlate_matrices(matrix_a, matrix_b)
    rounded_corr = [round(value, 3) for value in corr[0]]
    self.assertEqual(rounded_corr, [
        0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
        0.832])

    ecorr = eig_correlate_matrices(matrix_a, matrix_b)
    flat_ecorr = reduce(lambda x, y: x + y, ecorr)
    self.assertEqual([round(value, 3) for value in flat_ecorr], [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89])

    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        # one pre-joined argument prints the same under Python 2 and 3
        print(' '.join(('19', str(time() - started))))
def test_19_matrix_manip(self):
    """
    Save a Hi-C matrix, read it back, and check the matrix helpers.

    The test is skipped when ONLY is set and differs from '19'.  Hi-C data
    loaded from the 'lala-map~' reads file is written with hic_map() (once
    without by_chrom, then with by_chrom 'intra' and 'inter'); the saved
    matrix is reloaded and must equal the in-memory one.  The rounded
    output of plot_distance_vs_interactions(), insert_sizes(),
    correlate_matrices() and eig_correlate_matrices() is then compared
    with fixed reference values.  Every 'lala*' file is removed at the end.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        started = time()

    expected_decay = [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0]
    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828,
                     0.757, 0.797, 0.832]
    expected_ecorr = [0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
                      0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
                      0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
                      0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
                      0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
                      0.028, 0.004, 0.0, 0.028, 0.034, 0.89]

    from_reads = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(from_reads, savedata='lala-map.tsv~', savefig='lala.pdf~')
    hic_map(from_reads, by_chrom='intra', savedata='lala-maps~',
            savefig='lalalo~')
    hic_map(from_reads, by_chrom='inter', savedata='lala-maps~',
            savefig='lalala~')

    # slowest part of the all test:
    from_file = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(from_reads, from_file)

    # flatten the nested result by concatenation; 'nan' entries count as 0.0
    vals = plot_distance_vs_interactions(from_reads)
    decay = []
    for value in reduce(lambda x, y: x + y, vals):
        if str(value) != 'nan':
            decay.append(round(value, 2))
        else:
            decay.append(0.0)
    self.assertEqual(decay, expected_decay)

    size_a, size_b = insert_sizes('lala-map~')
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    matrix_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    # only the first element returned by correlate_matrices() is compared
    corr = correlate_matrices(matrix_a, matrix_b)
    self.assertEqual([round(value, 3) for value in corr[0]], expected_corr)

    ecorr = eig_correlate_matrices(matrix_a, matrix_b)
    flat_ecorr = reduce(lambda x, y: x + y, ecorr)
    self.assertEqual([round(value, 3) for value in flat_ecorr],
                     expected_ecorr)

    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        # one pre-joined argument prints the same under Python 2 and 3
        print(' '.join(('19', str(time() - started))))
def run(opts):
    """
    Filter, normalize and export the Hi-C matrix described by ``opts``.

    Steps, in order:

    1. load the reads (``opts.bed`` if given, otherwise the path returned
       by ``load_parameters_fromdb``) at resolution ``opts.reso``;
    2. flag poor columns with ``filter_columns`` and list them in a TSV
       file; if the filter raises ``ValueError`` it is re-run with
       ``perc_zero=100``;
    3. call ``normalize_hic`` and compute the four cis/trans ratios
       (normalized/raw, with/without diagonal);
    4. plot interactions against genomic distance and write the bias file;
    5. for each of "intra", "inter" and "genome" present in ``opts.keep``,
       save normalized and raw maps with ``hic_map``; image paths stay
       ``None`` when ``opts.only_txt`` is set;
    6. hand every result and output path to ``save_to_db``.

    All files are written under ``<opts.workdir>/04_normalization``.

    :param opts: parsed options; the attributes read here are ``bed``,
       ``workdir``, ``reso``, ``perc_zeros``, ``fast_filter``, ``factor``,
       ``keep`` and ``only_txt``.
    """
    def report(*items):
        # One pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(' '.join(['%s'] * len(items)) % items)

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))
    report('loading', mreads)
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    out_dir = path.join(opts.workdir, '04_normalization')
    mkdir(out_dir)
    tag = (opts.reso, param_hash)
    bad_tag = (opts.reso, opts.perc_zeros, param_hash)
    exp_name = path.split(opts.workdir)[-1]

    report('Get poor bins...')
    # the histogram is only saved when the fast filter is off
    if opts.fast_filter:
        bad_columns_fig = None
    else:
        bad_columns_fig = path.join(out_dir,
                                    'bad_columns_%s_%d_%s.pdf' % bad_tag)
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)
    except ValueError:
        # second attempt with perc_zero forced to 100
        hic_data.filter_columns(perc_zero=100, draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)

    # bad columns
    bad_columns_file = path.join(out_dir,
                                 'bad_columns_%s_%d_%s.tsv' % bad_tag)
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    report('Get biases using ICE...')
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=opts.factor)

    report('Getting cis/trans...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
    report('Cis/Trans ratio of normalized matrix including the diagonal',
           cis_trans_N_D)
    report('Cis/Trans ratio of normalized matrix excluding the diagonal',
           cis_trans_N_d)
    report('Cis/Trans ratio of raw matrix including the diagonal',
           cis_trans_n_D)
    report('Cis/Trans ratio of raw matrix excluding the diagonal',
           cis_trans_n_d)

    # Plot genomic distance vs interactions
    report('Plot genomic distance vs interactions...')
    inter_vs_gcoord = path.join(
        out_dir, 'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % tag)
    # only the first value of the second returned triple is kept
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=True,
        savefig=inter_vs_gcoord)
    report('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases
    bias_file = path.join(out_dir, 'bias_%s_%s.tsv' % tag)
    with open(bias_file, 'w') as out_bias:
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias]) + '\n')

    # to feed the save_to_db function: anything not produced stays None
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        report(" Saving intra chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            intra_dir_nrm_fig = path.join(
                out_dir, 'intra_chromosome_nrm_images_%s_%s' % tag)
            intra_dir_raw_fig = path.join(
                out_dir, 'intra_chromosome_raw_images_%s_%s' % tag)
        intra_dir_nrm_txt = path.join(
            out_dir, 'intra_chromosome_nrm_matrices_%s_%s' % tag)
        intra_dir_raw_txt = path.join(
            out_dir, 'intra_chromosome_raw_matrices_%s_%s' % tag)
        hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                name=exp_name, savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=exp_name, savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        report(" Saving inter chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            inter_dir_nrm_fig = path.join(
                out_dir, 'inter_chromosome_nrm_images_%s_%s' % tag)
            inter_dir_raw_fig = path.join(
                out_dir, 'inter_chromosome_raw_images_%s_%s' % tag)
        inter_dir_nrm_txt = path.join(
            out_dir, 'inter_chromosome_nrm_matrices_%s_%s' % tag)
        inter_dir_raw_txt = path.join(
            out_dir, 'inter_chromosome_raw_matrices_%s_%s' % tag)
        hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                name=exp_name, savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=exp_name, savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        report(" Saving normalized genomic matrix...")
        # Text-only mode skips the images, as in the intra and inter
        # sections above; the PDFs are drawn only when only_txt is off.
        if not opts.only_txt:
            genom_map_nrm_fig = path.join(
                out_dir, 'genomic_maps_nrm_%s_%s.pdf' % tag)
            genom_map_raw_fig = path.join(
                out_dir, 'genomic_maps_raw_%s_%s.pdf' % tag)
        genom_map_nrm_txt = path.join(out_dir, 'genomic_nrm_%s_%s.tsv' % tag)
        genom_map_raw_txt = path.join(out_dir, 'genomic_raw_%s_%s.tsv' % tag)
        hic_map(hic_data, normalized=True, cmap='jet', name=exp_name,
                savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet', name=exp_name,
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads,
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
# Post-filtering plots: genomic coverage of the filtered reads, of the
# dangling ends and of the self-circles, plus the interaction matrix of
# the filtered reads.  Each coverage table is also written as a
# tab-separated .bed file.
plt.rcParams['savefig.bbox'] = 'tight'
bin_size = {'Mb': 1e6}

# sample identifier: file name of the filtered reads without its suffix
pair_id = filtered_reads.split("/")[-1].replace("_filtered_map.tsv", "")

# the same three values fill every plot / coverage file name below
plot_fields = (POSTFILTERING_PLOTS, pair_id, genomic_coverage_resolution)
bed_fields = (COVERAGES, pair_id, genomic_coverage_resolution)

# Filtered reads: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_filtered_%s.png' % plot_fields
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    filtered_reads,
    name='filtered',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_filtered_%s.bed' % bed_fields
coverages.to_csv(outfile, sep='\t', index=False)

# Filtered reads: interaction matrix
outfile = '%s/%s_plot_interaction_matrix_filtered_%s.png' % plot_fields
plt.rcParams['font.size'] = 12
matrix_resolution = int(bin_size[genomic_coverage_resolution])
hic_map(filtered_reads, resolution=matrix_resolution, savefig=outfile,
        decay=False, cmap='jet')

# Dangling ends: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_dangling_ends_%s.png' % plot_fields
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    dangling_ends,
    name='dangling_ends',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_dangling_ends_%s.bed' % bed_fields
coverages.to_csv(outfile, sep='\t', index=False)

# self-circle ends: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_self_circle_%s.png' % plot_fields
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    self_circle,
    name='self_circle',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_self_circle_%s.bed' % bed_fields
coverages.to_csv(outfile, sep='\t', index=False)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq,
                  genome_fasta, genome_index, output_directory,
                  output_prefix, enzyme, res, chromosomes, threads_number,
                  clean_tmp, tmp_dir):
    """
    Map paired reads, filter them and write one Hi-C matrix per chromosome.

    The reads are mapped with ``iterative_mapping``, annotated with
    ``parse_sam``, merged with ``get_intersection``, masked with
    ``filter_reads`` / ``apply_filter`` and finally turned into
    intra-chromosomal matrices by ``hic_map``.  ``add_resolution`` and
    ``add_headers`` are then run on the matrix files.  Progress is printed
    to stdout and flushed after each step.

    :param left_reads_fastq: FASTQ with the left reads; used only when
       ``reads_fastq`` is ''.
    :param right_reads_fastq: FASTQ with the right reads; used only when
       ``reads_fastq`` is ''.
    :param reads_fastq: single FASTQ holding both reads of each pair, or ''
       when the two reads come in separate files.
    :param genome_fasta: reference genome, passed to ``parse_fasta``.
    :param genome_index: reference index, passed to ``iterative_mapping``.
    :param output_directory: directory receiving the SAM, TSV and matrix
       files.
    :param output_prefix: prefix of the merged / filtered read files when
       the reads come in separate files.
    :param enzyme: restriction enzyme, passed to ``parse_sam``.
    :param res: resolution handed to ``hic_map``, ``add_resolution`` and
       ``add_headers``.
    :param chromosomes: list of chromosome names.
    :param threads_number: number of threads used by ``iterative_mapping``.
    :param clean_tmp: when true, delete the SAM files and every ``*.tsv``
       file of ``output_directory`` at the end.
    :param tmp_dir: temporary directory used by ``iterative_mapping``.
    """
    def report(*items):
        # One pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(' '.join(['%s'] * len(items)) % items)

    def step(*items):
        # progress message that must show up before the slow call after it
        report(*items)
        stdout.flush()

    report('Begin to process reads.')
    if reads_fastq != '':
        # left and right reads are stored in one file
        (range_start_left, range_stop_left,
         range_start_right, range_stop_right) = calc_left_right_ranges(
             reads_fastq)
        report('Reads: ', reads_fastq)
        left_reads = right_reads = reads_fastq
    else:
        # left and right reads are stored separately
        (range_start_left, range_stop_left,
         range_start_right, range_stop_right) = calc_range(left_reads_fastq)
        report('Left reads: ', left_reads_fastq)
        report('Right reads: ', right_reads_fastq)
        report('Output prefix: ', output_prefix)
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq
    report('Reference genome FASTA: ', genome_fasta)
    report('Reference genome GEM index:', genome_index)
    report('Output directory: ', output_directory)
    report('Temp directory: ', tmp_dir)
    report('Enzyme: ', enzyme)
    report('Resolution: ', res, 'bp')
    report('Number of threads: ', threads_number)
    report('Start pos for left reads: ', range_start_left)
    report('Stop pos for left reads: ', range_stop_left)
    report('Start pos for right reads: ', range_start_right)
    report('Stop pos for right reads: ', range_stop_right)
    stdout.flush()

    left_stem = splitext(basename(left_reads))[0]
    right_stem = splitext(basename(right_reads))[0]

    # map left reads to reference genome
    out_sam_left_path = join(output_directory, left_stem + '_left.sam')
    step('Iterative mapping of left reads (using ' + str(threads_number)
         + ' threads)...')
    sams_left = iterative_mapping(genome_index, left_reads,
                                  out_sam_left_path,
                                  range_start_left, range_stop_left,
                                  nthreads=threads_number, temp_dir=tmp_dir)
    step('Done.')

    # map right reads to reference genome
    out_sam_right_path = join(output_directory, right_stem + '_right.sam')
    step('Iterative mapping of right reads (using ' + str(threads_number)
         + ' threads)...')
    sams_right = iterative_mapping(genome_index, right_reads,
                                   out_sam_right_path,
                                   range_start_right, range_stop_right,
                                   nthreads=threads_number, temp_dir=tmp_dir)
    step('Done.')

    # load reference genome sequence
    step('Load reference genome sequence...')
    chroms = chromosomes[:]  # hand parse_fasta a copy of the caller's list
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    step('Done.')

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left = join(output_directory, left_stem + '_left.tsv')
    tsv_right = join(output_directory, right_stem + '_right.tsv')
    step('Get information about restriction sites and reads placement...')
    # NOTE(review): ncpus is fixed at 8 and does not follow threads_number;
    # confirm this is intended.
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme,
              verbose=True, ncpus=8)
    step('Done.')

    # create file with both left and right reads that uniquely mapped to
    # the reference genome
    if reads_fastq != '':
        # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:
        # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads = join(output_directory,
                      common_reads_prefix + '_both_map_uniq.tsv')
    step('Merge info about left and right reads in one file...')
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    step('Done.')

    # find read IDs that are filtered by default TADbit filters
    step('Mask reads...')
    # debug
    report("uniq_reads =", uniq_reads)
    masked = filter_reads(uniq_reads)
    step('Done.')

    # apply all filters (exclude reads that were filtered)
    step('Filter masked reads...')
    filtered_reads = join(output_directory,
                          common_reads_prefix + '_filtered.tsv')
    apply_filter(uniq_reads, filtered_reads, masked)
    step('Done.')

    # create matrices (one matrix per chromosome)
    step('Create Hi-C maps (one per chromosome)...')
    hic_map(filtered_reads, resolution=res, by_chrom='intra',
            savedata=output_directory)
    step('Done.')

    # The resolution is the ``res`` argument: no ``resolution`` name is
    # defined in this function.
    step('Add resolution (' + str(res) + ') to matrix filenames...')
    add_resolution(chromosomes, res, output_directory)
    step('Done.')

    step('Add headers to matrix files...')
    add_headers(chromosomes, res, output_directory)
    step('Done.')

    if clean_tmp:
        # Remove all SAM and TSV files from the output directory
        step('Remove SAM and TSV files from the output directory.')
        # explicit loops: the deletion must not depend on map() being eager
        for pattern in (out_sam_left_path + '*',
                        out_sam_right_path + '*',
                        join(output_directory, '*.tsv')):
            for stale in glob.glob(pattern):
                os.remove(stale)
        step('Done.')
# 8, but leave some room for you other applications max_reads_per_chunk = chunk, single_end = True) sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')] sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')] print 'created thes SAM files:', sams2 parse_sam(sams1, sams2, frags, OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep, genome_seq, 'HindIII', verbose=True) reads1 = OUTPATH + 'reads1_%s.tsv' % rep reads2 = OUTPATH + 'reads2_%s.tsv' % rep reads = OUTPATH + 'reads12_%s.tsv' % rep get_intersection(reads1, reads2, reads, verbose=True) from pytadbit.mapping.analyze import hic_map hic_map(reads, genome_seq, resolution=100000, savedata='lala') from pytadbit.mapping.analyze import plot_genomic_distribution plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it # 690691 SRR_test_10000000/reads1_SRR_test.tsv # 690691 SRR_test_100000/reads1_SRR_test.tsv # 1242927 SRR_test_200000/reads1_SRR_test.tsv # 1035866 SRR_test_500000/reads1_SRR_test.tsv
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq,
                  genome_fasta, genome_index, output_directory,
                  output_prefix, enzyme, res, chromosomes, threads_number,
                  clean_tmp, tmp_dir):
    """
    Map paired reads, filter them and write one Hi-C matrix per chromosome.

    Pipeline: ``iterative_mapping`` on each read end, ``parse_sam``,
    ``get_intersection``, ``filter_reads`` + ``apply_filter``, then
    ``hic_map`` with ``by_chrom='intra'``.  ``add_resolution`` and
    ``add_headers`` are run on the matrix files afterwards.  Progress is
    printed to stdout and flushed after each step.

    :param left_reads_fastq: FASTQ with the left reads; used only when
       ``reads_fastq`` is ''.
    :param right_reads_fastq: FASTQ with the right reads; used only when
       ``reads_fastq`` is ''.
    :param reads_fastq: single FASTQ holding both reads of each pair, or ''
       when the two reads come in separate files.
    :param genome_fasta: reference genome, passed to ``parse_fasta``.
    :param genome_index: reference index, passed to ``iterative_mapping``.
    :param output_directory: directory receiving the SAM, TSV and matrix
       files.
    :param output_prefix: prefix of the merged / filtered read files when
       the reads come in separate files.
    :param enzyme: restriction enzyme, passed to ``parse_sam``.
    :param res: resolution handed to ``hic_map``, ``add_resolution`` and
       ``add_headers``.
    :param chromosomes: list of chromosome names.
    :param threads_number: number of threads used by ``iterative_mapping``.
    :param clean_tmp: when true, delete the SAM files and every ``*.tsv``
       file of ``output_directory`` at the end.
    :param tmp_dir: temporary directory used by ``iterative_mapping``.
    """
    # local import: wildcard expansion is needed for the final clean-up
    import glob

    def report(*items):
        # One pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(' '.join(['%s'] * len(items)) % items)

    def step(*items):
        # progress message that must show up before the slow call after it
        report(*items)
        stdout.flush()

    single_file = reads_fastq != ''

    report('Begin to process reads.')
    if single_file:
        # left and right reads are stored in one file
        (range_start_left, range_stop_left,
         range_start_right, range_stop_right) = calc_left_right_ranges(
             reads_fastq)
        report('Reads: ', reads_fastq)
        left_reads = right_reads = reads_fastq
    else:
        # left and right reads are stored separately
        (range_start_left, range_stop_left,
         range_start_right, range_stop_right) = calc_range(left_reads_fastq)
        report('Left reads: ', left_reads_fastq)
        report('Right reads: ', right_reads_fastq)
        report('Output prefix: ', output_prefix)
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq
    report('Reference genome FASTA: ', genome_fasta)
    report('Reference genome GEM index:', genome_index)
    report('Output directory: ', output_directory)
    report('Temp directory: ', tmp_dir)
    report('Enzyme: ', enzyme)
    report('Resolution: ', res, 'bp')
    report('Number of threads: ', threads_number)
    report('Start pos for left reads: ', range_start_left)
    report('Stop pos for left reads: ', range_stop_left)
    report('Start pos for right reads: ', range_start_right)
    report('Stop pos for right reads: ', range_stop_right)
    stdout.flush()

    left_stem = splitext(basename(left_reads))[0]
    right_stem = splitext(basename(right_reads))[0]

    # map left reads to reference genome
    out_sam_left_path = join(output_directory, left_stem + '_left.sam')
    step('Iterative mapping of left reads (using ' + str(threads_number)
         + ' threads)...')
    sams_left = iterative_mapping(genome_index, left_reads,
                                  out_sam_left_path,
                                  range_start_left, range_stop_left,
                                  nthreads=threads_number, temp_dir=tmp_dir)
    step('Done.')

    # map right reads to reference genome
    out_sam_right_path = join(output_directory, right_stem + '_right.sam')
    step('Iterative mapping of right reads (using ' + str(threads_number)
         + ' threads)...')
    sams_right = iterative_mapping(genome_index, right_reads,
                                   out_sam_right_path,
                                   range_start_right, range_stop_right,
                                   nthreads=threads_number, temp_dir=tmp_dir)
    step('Done.')

    # load reference genome sequence
    step('Load reference genome sequence...')
    chroms = chromosomes[:]  # hand parse_fasta a copy of the caller's list
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    step('Done.')

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left = join(output_directory, left_stem + '_left.tsv')
    tsv_right = join(output_directory, right_stem + '_right.tsv')
    step('Get information about restriction sites and reads placement...')
    # NOTE(review): ncpus is fixed at 8 and does not follow threads_number;
    # confirm this is intended.
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme,
              verbose=True, ncpus=8)
    step('Done.')

    # create file with both left and right reads that uniquely mapped to
    # the reference genome
    if single_file:
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:
        common_reads_prefix = output_prefix
    uniq_reads = join(output_directory,
                      common_reads_prefix + '_both_map_uniq.tsv')
    step('Merge info about left and right reads in one file...')
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    step('Done.')

    # find read IDs that are filtered by default TADbit filters
    step('Mask reads...')
    masked = filter_reads(uniq_reads)
    step('Done.')

    # apply all filters (exclude reads that were filtered)
    step('Filter masked reads...')
    filtered_reads = join(output_directory,
                          common_reads_prefix + '_filtered.tsv')
    apply_filter(uniq_reads, filtered_reads, masked)
    step('Done.')

    # create matrices (one matrix per chromosome)
    step('Create Hi-C maps (one per chromosome)...')
    hic_map(filtered_reads, resolution=res, by_chrom='intra',
            savedata=output_directory)
    step('Done.')

    # The resolution is the ``res`` argument: no ``resolution`` name is
    # defined in this function.
    step('Add resolution (' + str(res) + ') to matrix filenames...')
    add_resolution(chromosomes, res, output_directory)
    step('Done.')

    step('Add headers to matrix files...')
    add_headers(chromosomes, res, output_directory)
    step('Done.')

    if clean_tmp:
        # Remove all SAM and TSV files from the output directory
        step('Remove SAM and TSV files from the output directory.')
        # Expand the wildcards first: presumably ``remove`` is os.remove,
        # which takes one literal path and does not interpret '*'.
        for pattern in (out_sam_left_path + '*',
                        out_sam_right_path + '*',
                        join(output_directory, '*.tsv')):
            for stale in glob.glob(pattern):
                remove(stale)
        step('Done.')
def run(opts):
    """
    Filter, optionally normalize, and export the Hi-C matrix of ``opts``.

    Steps, in order:

    1. load the reads (``opts.bed`` if given, otherwise the path returned
       by ``load_parameters_fromdb``) at resolution ``opts.reso``;
    2. flag poor columns with ``filter_columns`` and list them in a TSV
       file;
    3. unless ``opts.filter_only`` is set, call ``normalize_hic``;
    4. compute the cis/trans ratios (the normalized ones stay ``nan`` in
       filter-only mode);
    5. plot interactions against genomic distance, write the bias file
       (normalizing runs only) and pickle the Hi-C data object;
    6. for each of "intra", "inter" and "genome" present in ``opts.keep``,
       save the raw maps and, when normalizing, the normalized maps with
       ``hic_map``; image paths stay ``None`` when ``opts.only_txt`` is set;
    7. hand every result and output path to ``save_to_db``.

    All files are written under ``<opts.workdir>/04_normalization``.

    :param opts: parsed options; the attributes read here are ``bed``,
       ``workdir``, ``reso``, ``perc_zeros``, ``min_count``,
       ``fast_filter``, ``filter_only``, ``factor``, ``keep`` and
       ``only_txt``.

    :raises ValueError: when ``filter_columns`` raises ``ValueError``
    """
    def report(*items):
        # One pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(' '.join(['%s'] * len(items)) % items)

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))
    report('loading', mreads)
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    out_dir = path.join(opts.workdir, '04_normalization')
    mkdir(out_dir)
    tag = (opts.reso, param_hash)
    bad_tag = (opts.reso, opts.perc_zeros, opts.min_count, param_hash)
    exp_name = path.split(opts.workdir)[-1]
    normalize = not opts.filter_only

    report('Get poor bins...')
    try:
        # the histogram is only saved when the fast filter is off
        if opts.fast_filter:
            bad_columns_fig = None
        else:
            bad_columns_fig = path.join(
                out_dir, 'bad_columns_%s_%d_%d_%s.pdf' % bad_tag)
        hic_data.filter_columns(perc_zero=opts.perc_zeros,
                                min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = path.join(out_dir,
                                 'bad_columns_%s_%d_%d_%s.tsv' % bad_tag)
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    if normalize:
        report('Get biases using ICE...')
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    report('Getting cis/trans...')
    # normalized ratios are only meaningful after normalization
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if normalize:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=True)
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    if normalize:
        report('Cis/Trans ratio of normalized matrix including the diagonal',
               cis_trans_N_D)
        report('Cis/Trans ratio of normalized matrix excluding the diagonal',
               cis_trans_N_d)
    report('Cis/Trans ratio of raw matrix including the diagonal',
           cis_trans_n_D)
    report('Cis/Trans ratio of raw matrix excluding the diagonal',
           cis_trans_n_d)

    # Plot genomic distance vs interactions
    report('Plot genomic distance vs interactions...')
    inter_vs_gcoord = path.join(
        out_dir, 'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % tag)
    # only the first value of the second returned triple is kept
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=normalize, savefig=inter_vs_gcoord)
    report('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases
    # NOTE(review): bias_file is passed to save_to_db even in filter-only
    # mode, where the file is never written; confirm this is intended.
    bias_file = path.join(out_dir, 'bias_%s_%s.tsv' % tag)
    if normalize:
        with open(bias_file, 'w') as out_bias:
            out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                      for i in hic_data.bias]) + '\n')

    # pickle the HiC-data object
    report('Saving genomic matrix')
    pickle_path = path.join(
        out_dir, 'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    # binary mode: pickled data is bytes, and text mode may translate
    # newlines on some platforms
    with open(pickle_path, 'wb') as out:
        dump(hic_data, out)

    # to feed the save_to_db function: anything not produced stays None
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        report(" Saving intra chromosomal raw and normalized matrices...")
        # Normalized paths are set only when the normalized maps are
        # actually written, as in the inter and genome sections below.
        if not opts.only_txt:
            if normalize:
                intra_dir_nrm_fig = path.join(
                    out_dir, 'intra_chromosome_nrm_images_%s_%s' % tag)
            intra_dir_raw_fig = path.join(
                out_dir, 'intra_chromosome_raw_images_%s_%s' % tag)
        if normalize:
            intra_dir_nrm_txt = path.join(
                out_dir, 'intra_chromosome_nrm_matrices_%s_%s' % tag)
        intra_dir_raw_txt = path.join(
            out_dir, 'intra_chromosome_raw_matrices_%s_%s' % tag)
        if normalize:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=exp_name, savefig=intra_dir_nrm_fig,
                    savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=exp_name, savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        report(" Saving inter chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            if normalize:
                inter_dir_nrm_fig = path.join(
                    out_dir, 'inter_chromosome_nrm_images_%s_%s' % tag)
            inter_dir_raw_fig = path.join(
                out_dir, 'inter_chromosome_raw_images_%s_%s' % tag)
        if normalize:
            inter_dir_nrm_txt = path.join(
                out_dir, 'inter_chromosome_nrm_matrices_%s_%s' % tag)
        inter_dir_raw_txt = path.join(
            out_dir, 'inter_chromosome_raw_matrices_%s_%s' % tag)
        if normalize:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=exp_name, savefig=inter_dir_nrm_fig,
                    savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=exp_name, savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        report(" Saving normalized genomic matrix...")
        if not opts.only_txt:
            if normalize:
                genom_map_nrm_fig = path.join(
                    out_dir, 'genomic_maps_nrm_%s_%s.pdf' % tag)
            genom_map_raw_fig = path.join(
                out_dir, 'genomic_maps_raw_%s_%s.pdf' % tag)
        if normalize:
            genom_map_nrm_txt = path.join(
                out_dir, 'genomic_nrm_%s_%s.tsv' % tag)
        genom_map_raw_txt = path.join(out_dir, 'genomic_raw_%s_%s.tsv' % tag)
        if normalize:
            hic_map(hic_data, normalized=True, cmap='jet', name=exp_name,
                    savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet', name=exp_name,
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads,
               len(hic_data.bads.keys()), len(hic_data),
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time)