def test_18_filter_reads(self): if CHKTIME: t0 = time() for ali in ['map', 'sam']: seed(1) if 13436 == int(random()*100000): same_seed = True genome = generate_random_ali(ali) genome_bis = parse_fasta('test.fa~', verbose=False) self.assertEqual(genome, genome_bis) else: same_seed = False genome = parse_fasta('test.fa~') # PARSE SAM if ali == 'map': from pytadbit.parsers.map_parser import parse_map as parser else: try: from pytadbit.parsers.sam_parser import parse_sam as parser except ImportError: print 'ERROR: PYSAM not found, skipping test\n' continue parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)], './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali)) # FILTER masked = filter_reads('lala-%s~' % (ali), verbose=False, fast=(ali=='map')) self.assertEqual(masked[1]['reads'], 1000) self.assertEqual(masked[2]['reads'], 1000) self.assertEqual(masked[3]['reads'], 1000) self.assertEqual(masked[4]['reads'], 1000) if same_seed: self.assertEqual(masked[5]['reads'], 1125) self.assertEqual(masked[6]['reads'], 2328) self.assertEqual(masked[7]['reads'], 0) self.assertEqual(masked[8]['reads'], 94) self.assertEqual(masked[10]['reads'], 1) else: self.assertTrue (masked[5]['reads'] > 1000) self.assertEqual(masked[9]['reads'], 1000) apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1], reverse=True, verbose=False) self.assertEqual(len([True for l in open('lala-map-filt~') if not l.startswith('#')]), 1000) d = plot_iterative_mapping('lala1-map~', 'lala2-map~') self.assertEqual(d[0][1], 6000) if CHKTIME: self.assertEqual(True, True) print '18', time() - t0
def main(): fastq = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq' fastq = 'short_dixon-2012_200bp.fastq' # fastq = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq' gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem' out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/' out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/' temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/' temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/' print 'read 1' outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, windows=((1,100),), add_site=True) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, windows=((101, 200),), add_site=True) # print 'read 1' # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', # temp_dir=temp_dir1, # windows=(zip(*([0] * len(range(25, 105, 5)), # range(25,105,5))))) # print 'read 2' # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', # temp_dir=temp_dir2, # windows=(zip(*([100] * len(range(125, 205, 5)), # range(125,205,5))))) print outfiles1 print 'xcmvnkljnv' print outfiles2 from pytadbit.parsers.map_parser import parse_map from pytadbit.parsers.genome_parser import parse_fasta from pytadbit.mapping.mapper import get_intersection from pytadbit.mapping.filter import filter_reads, apply_filter read1, read2 = 'read1.tsv', 'read2.tsv', parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2, genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'), re_name='HindIII', verbose=True) reads = 'both_reads.tsv' get_intersection(read1, read2, reads) masked = filter_reads(reads) freads = 'filtered_reads.tsv' apply_filter(reads, freads, masked)
} out1.write(read.format(**read1)) out2.write(read.format(**read2)) i += 1 out1.close() out2.close() # PARSE SAM if ali == 'map': from pytadbit.parsers.map_parser import parse_map as parser else: from pytadbit.parsers.sam_parser import parse_sam as parser parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)], './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali)) # FILTER from pytadbit.mapping.filter import filter_reads masked = filter_reads('lala-%s~' % (ali))
else: read2 = {'crm': crm1, 'pos': pos1, 'flag': flags[sd1], 'id': 'lala05.1%011d' % (i)} read1 = {'crm': crm2, 'pos': pos2, 'flag': flags[sd2], 'id': 'lala05.1%011d' % (i)} out1.write(read.format(**read1)) out2.write(read.format(**read2)) i += 1 out1.close() out2.close() # PARSE SAM if ali == 'map': from pytadbit.parsers.map_parser import parse_map as parser else: from pytadbit.parsers.sam_parser import parse_sam as parser parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)], './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali)) # FILTER from pytadbit.mapping.filter import filter_reads masked = filter_reads('lala-%s~' % (ali))
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' stdout.flush() # debug print "uniq_reads =", uniq_reads masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() map(os.remove, glob.glob(out_sam_left_path + '*')) map(os.remove, glob.glob(out_sam_right_path + '*')) map(os.remove, glob.glob(join(output_directory, '*.tsv'))) print 'Done.' stdout.flush()
def test_18_filter_reads(self): if CHKTIME: t0 = time() for ali in ['map', 'sam']: seed(1) if 13436 == int(random() * 100000): same_seed = True genome = generate_random_ali(ali) genome_bis = parse_fasta('test.fa~', verbose=False) self.assertEqual(genome, genome_bis) else: same_seed = False genome = parse_fasta('test.fa~') # PARSE SAM if ali == 'map': from pytadbit.parsers.map_parser import parse_map as parser else: try: from pytadbit.parsers.sam_parser import parse_sam as parser except ImportError: print 'ERROR: PYSAM not found, skipping test\n' continue parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)], './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali)) # FILTER masked = filter_reads('lala-%s~' % (ali), verbose=False, fast=(ali == 'map')) self.assertEqual(masked[1]['reads'], 1000) self.assertEqual(masked[2]['reads'], 1000) self.assertEqual(masked[3]['reads'], 1000) self.assertEqual(masked[4]['reads'], 1000) if same_seed: self.assertEqual(masked[5]['reads'], 1125) self.assertEqual(masked[6]['reads'], 2328) self.assertEqual(masked[7]['reads'], 0) self.assertEqual(masked[8]['reads'], 94) self.assertEqual(masked[10]['reads'], 1) else: self.assertTrue(masked[5]['reads'] > 1000) self.assertEqual(masked[9]['reads'], 1000) apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1], reverse=True) self.assertEqual( len([ True for l in open('lala-map-filt~') if not l.startswith('#') ]), 1000) d = plot_iterative_mapping('lala1-map~', 'lala2-map~') self.assertEqual(d[0][1], 6000) if CHKTIME: self.assertEqual(True, True) print '18', time() - t0
outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, frag_map=False, windows=(zip(*(r_beg1, r_end1)))) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, frag_map=False, windows=(zip(*(r_beg2, r_end2)))) parse_thing = parse_map elif mapper == 3: print 'read 1' outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, windows=(zip(*(r_beg1, r_end1)))) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, windows=(zip(*(r_beg2, r_end2)))) parse_thing = parse_map read1, read2 = 'read1.tsv_%s-%s' % (mapper, win), 'read2.tsv_%s-%s' % (mapper, win) parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2, genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'), re_name='HindIII', verbose=True) reads = 'both_reads.tsv_%s-%s' % (mapper, win) get_intersection(read1, read2, reads) masked = filter_reads(reads) freads = 'filtered_reads.tsv_%s-%s' % (mapper, win) apply_filter(reads, freads, masked)
# 8, but leave some room for you other applications max_reads_per_chunk = chunk, single_end = True) sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')] sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')] print 'created thes SAM files:', sams2 parse_sam(sams1, sams2, frags, OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep, genome_seq, 'HindIII', verbose=True) reads1 = OUTPATH + 'reads1_%s.tsv' % rep reads2 = OUTPATH + 'reads2_%s.tsv' % rep reads = OUTPATH + 'reads12_%s.tsv' % rep get_intersection(reads1, reads2, reads, verbose=True) from pytadbit.mapping.analyze import hic_map hic_map(reads, genome_seq, resolution=100000, savedata='lala') from pytadbit.mapping.analyze import plot_genomic_distribution plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it # 690691 SRR_test_10000000/reads1_SRR_test.tsv # 690691 SRR_test_100000/reads1_SRR_test.tsv # 1242927 SRR_test_200000/reads1_SRR_test.tsv # 1035866 SRR_test_500000/reads1_SRR_test.tsv
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' stdout.flush() masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() remove(out_sam_left_path + '*') remove(out_sam_right_path + '*') remove(join(output_directory, '*.tsv')) print 'Done.' stdout.flush()
break read1 = {'crm': crm, 'pos': pos1, 'flag': sd1, 'id': 'lala04.%012d' % (i)} read2 = {'crm': crm, 'pos': pos2, 'flag': sd2, 'id': 'lala04.%012d' % (i)} out1.write(read_str.format(**read1)) out2.write(read_str.format(**read2)) # TOO CLOSE FROM RE out1.close() out2.close() # PARSE SAM from pytadbit.parsers.sam_parser import parse_sam parse_sam(['test_read1.sam~'], ['test_read2.sam~'], 'lala1~', 'lala2~', genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1~', 'lala2~', 'lala~') # FILTER from pytadbit.mapping.filter import filter_reads masked = filter_reads('lala~')