def test_18_filter_reads(self):
    if ONLY and ONLY != '18':
        return
    if CHKTIME:
        t0 = time()

    for ali in ['map', 'sam']:
        seed(1)
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta('test.fa~', verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta('test.fa~')

        # PARSE SAM
        if ali == 'map':
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print 'ERROR: PYSAM not found, skipping test\n'
                continue
        parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
               './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
               re_name='DPNII', mapper='GEM')

        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                         'lala-%s~' % (ali))

        # FILTER
        masked = filter_reads('lala-%s~' % (ali), verbose=False,
                              fast=(ali == 'map'))
        self.assertEqual(masked[1]['reads'], 1000)
        self.assertEqual(masked[2]['reads'], 1000)
        self.assertEqual(masked[3]['reads'], 1000)
        self.assertEqual(masked[4]['reads'], 1000)
        if same_seed:
            self.assertEqual(masked[5]['reads'], 1110)
            self.assertEqual(masked[6]['reads'], 2332)
            self.assertEqual(masked[7]['reads'], 0)
            self.assertEqual(masked[8]['reads'], 141)
            self.assertEqual(masked[10]['reads'], 1)
        else:
            self.assertTrue(masked[5]['reads'] > 1000)
        self.assertEqual(masked[9]['reads'], 1000)

    apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                 reverse=True, verbose=False)
    self.assertEqual(len([True for l in open('lala-map-filt~')
                          if not l.startswith('#')]), 1000)

    d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
    self.assertEqual(d[0][1], 6000)

    if CHKTIME:
        self.assertEqual(True, True)
        print '18', time() - t0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print '  - median insert size =', median
    print '  - double median absolute deviation of insert size =', mad
    print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print ('   Using the maximum continuous fragment size '
           '(%d bp) to check for pseudo-dangling ends') % max_mole
    print ('   Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print 'identify pairs to filter...'
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
def test_18_filter_reads(self): if ONLY and ONLY != "18": return if CHKTIME: t0 = time() for ali in ["map", "sam"]: seed(1) if 13436 == int(random() * 100000): same_seed = True genome = generate_random_ali(ali) genome_bis = parse_fasta("test.fa~", verbose=False) self.assertEqual(genome, genome_bis) else: same_seed = False genome = parse_fasta("test.fa~") # PARSE SAM if ali == "map": from pytadbit.parsers.map_parser import parse_map as parser else: try: from pytadbit.parsers.sam_parser import parse_sam as parser except ImportError: print "ERROR: PYSAM not found, skipping test\n" continue parser( ["test_read1.%s~" % (ali)], ["test_read2.%s~" % (ali)], "./lala1-%s~" % (ali), "./lala2-%s~" % (ali), genome, re_name="DPNII", mapper="GEM", ) # GET INTERSECTION from pytadbit.mapping import get_intersection get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali), "lala-%s~" % (ali)) # FILTER masked = filter_reads("lala-%s~" % (ali), verbose=False, fast=(ali == "map")) self.assertEqual(masked[1]["reads"], 1000) self.assertEqual(masked[2]["reads"], 1000) self.assertEqual(masked[3]["reads"], 1000) self.assertEqual(masked[4]["reads"], 1000) if same_seed: self.assertEqual(masked[5]["reads"], 1110) self.assertEqual(masked[6]["reads"], 2332) self.assertEqual(masked[7]["reads"], 0) self.assertEqual(masked[8]["reads"], 141) self.assertEqual(masked[10]["reads"], 1) else: self.assertTrue(masked[5]["reads"] > 1000) self.assertEqual(masked[9]["reads"], 1000) apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1], reverse=True, verbose=False) self.assertEqual(len([True for l in open("lala-map-filt~") if not l.startswith("#")]), 1000) d = plot_iterative_mapping("lala1-map~", "lala2-map~") self.assertEqual(d[0][1], 6000) if CHKTIME: self.assertEqual(True, True) print "18", time() - t0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = fragment_size(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print '  - median insert size =', median
    print '  - double median absolute deviation of insert size =', mad
    print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print ('   Using the maximum continuous fragment size '
           '(%d bp) to check for pseudo-dangling ends') % max_mole
    print ('   Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print 'identify pairs to filter...'
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)
def test_18_filter_reads(self):
    if ONLY and "18" not in ONLY:
        return
    if CHKTIME:
        t0 = time()

    for ali in ["map", "sam"]:
        seed(1)
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta("test.fa~", verbose=False,
                                     save_cache=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta("test.fa~", save_cache=False)

        # PARSE SAM
        if ali == "map":
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print("ERROR: PYSAM not found, skipping test\n")
                continue
        parser(["test_read1.%s~" % (ali)], ["test_read2.%s~" % (ali)],
               "./lala1-%s~" % (ali), "./lala2-%s~" % (ali), genome,
               re_name="DPNII", mapper="GEM")

        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                         "lala-%s~" % (ali))

        # FILTER
        masked = filter_reads("lala-%s~" % (ali), verbose=False,
                              fast=(ali == "map"))
        self.assertEqual(masked[1]["reads"], 1000)
        self.assertEqual(masked[2]["reads"], 1000)
        self.assertEqual(masked[3]["reads"], 1000)
        self.assertEqual(masked[4]["reads"], 1000)
        if same_seed:
            self.assertEqual(masked[5]["reads"], 1091)
            self.assertEqual(masked[6]["reads"], 2230)
            self.assertEqual(masked[7]["reads"], 0)
            self.assertEqual(masked[8]["reads"], 100)
            self.assertEqual(masked[10]["reads"], 5)
        else:
            self.assertTrue(masked[5]["reads"] > 1000)
        self.assertEqual(masked[9]["reads"], 1001)

    apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1],
                 reverse=True, verbose=False)
    with open("lala-map-filt~") as f_lala_filt:
        self.assertEqual(
            len([True for l in f_lala_filt if not l.startswith("#")]), 1000)

    d = plot_iterative_mapping("lala1-map~", "lala2-map~")
    self.assertEqual(d[0][1], 6000)

    if CHKTIME:
        self.assertEqual(True, True)
        print("18", time() - t0)
import os
import glob

from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping import get_intersection

# Pipeline parameters (fasta, SAMPLE, version, paired1, paired2, MAP_DIR,
# restriction_enzyme, slots) are expected to be defined by the surrounding
# workflow configuration.

# Load the genome
genome_seq = parse_fasta(fasta)

# Output directory
RESULTS = '%s/results/%s/processed_reads' % (SAMPLE, version)
if not os.path.exists(RESULTS):
    os.makedirs(RESULTS)

infiles = []
outfiles = []
for infile in [paired1, paired2]:
    bname = infile.split("/")[-1].replace(".fastq.gz", "")
    maps = glob.glob('%s/%s/*' % (MAP_DIR, bname))
    infiles.append(maps)
    outfiles.append('%s/%s_map.tsv' % (RESULTS, bname))

# parse the mapped reads of both ends, then keep pairs mapped at both ends
parse_map(infiles[0], infiles[1], outfiles[0], outfiles[1], genome_seq,
          restriction_enzyme, verbose=True, ncpus=slots)

final_output = outfiles[0].replace('read1', 'both')
get_intersection(outfiles[0], outfiles[1], final_output, verbose=True)
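# --- Hedged sketch of the filtering step that typically follows the
# intersection above, reusing the filter_reads/apply_filter API shown in
# the run() functions in this file. The threshold values and the
# 'filtered_output' name are illustrative assumptions; in the run()
# functions, max_molecule_length and min_dist_to_re are derived from the
# fragment-size distribution rather than hard-coded.
from pytadbit.mapping.filter import filter_reads, apply_filter

filtered_output = final_output.replace('both', 'valid')  # hypothetical name
masked = filter_reads(final_output,
                      max_molecule_length=500,  # assumed, ~max fragment size
                      min_dist_to_re=750)       # assumed, ~max_f + MAD
# Filter ids 1-10 are the ones exercised by the tests in this file; which
# subset to apply is a per-experiment choice.
apply_filter(final_output, filtered_output, masked,
             filters=[1, 2, 3, 4, 9, 10])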
def tb_parse_mapping_iter(
        self, genome_seq, enzyme_name,
        window1_1, window1_2, window1_3, window1_4,
        window2_1, window2_2, window2_3, window2_4,
        reads):
    """
    Function to parse the aligned reads and save the matching pairs

    Parameters
    ----------
    genome_seq : dict
        Object containing the sequence of each of the chromosomes
    enzyme_name : str
        Name of the enzyme used to digest the genome
    window1_1 : str
        Location of the first window index file for read 1
    window1_2 : str
        Location of the second window index file for read 1
    window1_3 : str
        Location of the third window index file for read 1
    window1_4 : str
        Location of the fourth window index file for read 1
    window2_1 : str
        Location of the first window index file for read 2
    window2_2 : str
        Location of the second window index file for read 2
    window2_3 : str
        Location of the third window index file for read 2
    window2_4 : str
        Location of the fourth window index file for read 2
    reads : str
        Location of the output file for reads that have a matching
        location at both ends of the paired reads

    Returns
    -------
    bool
        True on completion; the intersection of mapped reads with
        matching locations in both paired-end files is written to
        ``reads``
    """
    reads1 = reads + '_reads_1.tsv'
    reads2 = reads + '_reads_2.tsv'
    reads_both = reads + '_reads_both.tsv'

    # parse the four iterative-mapping windows of each read end
    parse_map(
        [window1_1, window1_2, window1_3, window1_4],
        [window2_1, window2_2, window2_3, window2_4],
        out_file1=reads1,
        out_file2=reads2,
        genome_seq=genome_seq,
        re_name=enzyme_name,
        verbose=True,
        # ncpus=32
    )

    # keep only pairs mapped at both ends
    get_intersection(reads1, reads2, reads_both, verbose=True)

    # copy the intersection to the expected output location
    with open(reads, "wb") as f_out:
        with open(reads_both, "rb") as f_in:
            f_out.write(f_in.read())

    return True
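# --- Hedged usage sketch for tb_parse_mapping_iter. All file names, the
# 'tool' instance, and the enzyme are hypothetical placeholders; the
# function is a method, so it is shown called on whatever tool object
# hosts it in the surrounding framework.
from pytadbit.parsers.genome_parser import parse_fasta

genome_seq = parse_fasta('genome.fa')  # hypothetical FASTA path
tool.tb_parse_mapping_iter(
    genome_seq, 'MboI',
    'r1_win1.map', 'r1_win2.map', 'r1_win3.map', 'r1_win4.map',  # read 1
    'r2_win1.map', 'r2_win2.map', 'r2_win3.map', 'r2_win4.map',  # read 2
    'sample_reads.tsv')
# On success, pairs mapped at both ends end up in 'sample_reads.tsv';
# intermediate files 'sample_reads.tsv_reads_1.tsv', '..._reads_2.tsv'
# and '..._reads_both.tsv' are also written.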
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    if opts.fast_fragment:
        reads = fname1
        counts_multis = ['#' in line.split('\t')[0]
                         for line in open(reads)]
        count = len(counts_multis)
        multiples = {}
        multiples[1] = sum(count_mult for count_mult in counts_multis
                           if count_mult)
        del counts_multis
    else:
        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads,
                                            compress=opts.compress_input)

    # compute insert size
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    try:
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
    except ZeroDivisionError:
        warn('WARNING: cannot compute fragment length, too few '
             'dangling-ends. Setting median length to 400 nt.')
        median, max_f, mad = 400, 100, 1000

    print('  - median insert size =', median)
    print('  - double median absolute deviation of insert size =', mad)
    print('  - max insert size (when a gap in continuity of > 10 bp is '
          'found in fragment lengths) =', max_f)

    max_mole = max_f        # pseudo dangling-ends
    min_dist = max_f + mad  # random breaks
    print('   Using the maximum continuous fragment size '
          '(%d bp) to check for pseudo-dangling ends' % max_mole)
    print('   Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print('identify pairs to filter...')
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          strict_duplicates=opts.strict_duplicates,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)
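# --- Hedged sketch: inspecting the 'masked' mapping returned by
# filter_reads. The tests in this file show it is keyed by filter id
# (1-10) with the number of flagged pairs under 'reads'; any other field
# would be an assumption, so only 'reads' is used here. A helper like this
# could be called inside run() right after apply_filter.
def report_filters(masked, n_valid_pairs):
    for filt in sorted(masked):
        print('  filter %2d: %d reads flagged'
              % (filt, masked[filt]['reads']))
    print('  valid pairs kept: %d' % n_valid_pairs)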