def test_ilmn_bcl2fastq_miseq(self):
    """Run ``main`` in ILMN_BCL2FASTQ mode on the ``t1_miseq`` fixture.

    Checks the barcode and read file lists and the per-chunk flags that
    ``main`` writes onto ``outs`` for sample ``9968``.

    The fixture lives under ``/mnt/projects``; when it is not present the
    test is reported as skipped instead of silently passing.
    """
    read_path = "/mnt/projects/pat/bcl_direct/t1_miseq/p1"
    if not os.path.exists(read_path):
        self.skipTest("fixture directory not available: %s" % read_path)

    args = martian.Record({
        'input_mode': "ILMN_BCL2FASTQ",
        'sample_def': [{
            "gem_group": None,
            "lanes": None,
            "read_path": read_path,
            "samples": ["9968"]
        }],
        'barcode_whitelist': "737K-april-2014",
    })
    outs = martian.Record({})
    main(args, outs)

    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when a comparison fails.
    self.assertEqual(outs.barcodes,
                     [read_path + "/9968_S3_L001_R2_001.fastq.gz"])
    self.assertEqual(outs.barcodes_reverse_complement[0], False)
    self.assertEqual(outs.reads_interleaved, False)
    self.assertEqual(outs.sample_indices, [None])
    self.assertEqual(outs.is_read1[0], True)
    self.assertEqual(outs.is_read1[1], False)
    self.assertEqual(outs.reads, [
        read_path + "/9968_S3_L001_R1_001.fastq.gz",
        read_path + "/9968_S3_L001_R3_001.fastq.gz",
    ])
def test_ilmn_bcl2fastq_v184(self):
    """Run ``main`` in ILMN_BCL2FASTQ mode on the ``v184_t1`` fixture.

    Checks the first barcode file and the per-chunk flags that ``main``
    writes onto ``outs`` for sample ``a``.

    The fixture lives under ``/mnt/projects``; when it is not present the
    test is reported as skipped instead of silently passing.
    """
    read_path = "/mnt/projects/pat/bcl_direct/v184_t1/Project_proj1/Sample_a"
    if not os.path.exists(read_path):
        self.skipTest("fixture directory not available: %s" % read_path)

    args = martian.Record({
        'input_mode': "ILMN_BCL2FASTQ",
        'sample_def': [{
            "gem_group": None,
            "lanes": None,
            "read_path": read_path,
            "samples": ["a"]
        }],
        'barcode_whitelist': "737K-april-2014",
    })
    outs = martian.Record({})
    main(args, outs)

    self.assertEqual(outs.barcodes[0],
                     read_path + "/a_CGCTTCAA_L001_R2_001.fastq.gz")
    self.assertEqual(outs.barcodes_reverse_complement[0], False)
    self.assertEqual(outs.reads_interleaved, False)
    # Index 10 is what the original test probes; it also implies that at
    # least 11 chunks are produced.
    self.assertIsNone(outs.sample_indices[10])
    self.assertEqual(outs.is_read1[0], True)
    self.assertEqual(outs.is_read1[1], False)
def test_align(self):
    """Align the FASTQ fixture and check that no reads are lost.

    Runs ``main`` with bwa / MEM against ``hg19`` on ``IN_FASTQ`` and
    compares the number of records in the output BAM with the number of
    records read from the input FASTQ.
    """
    args = martian.Record({
        'chunk_input': IN_FASTQ,
        'aligner': 'bwa',
        'aligner_method': 'MEM',
        'reference_path': 'hg19',
        '__threads': 1,
        'reads_interleaved': True
    })
    outs = martian.Record({'default': OUT_BAM})
    main(args, outs)

    out_bam = pysam.Samfile(OUT_BAM)
    # Close the BAM handle when the test finishes, pass or fail.
    self.addCleanup(out_bam.close)
    bam_reads = list(out_bam)

    with open(IN_FASTQ) as fq_file:
        fq_reads = list(
            tk_fasta.read_generator_fastq(fq_file, paired_end=False))

    # Every FASTQ record must appear in the BAM
    self.assertEqual(len(bam_reads), len(fq_reads))
def main(self):
    """Read the stage inputs from metadata and dispatch on the run type.

    * ``split``: calls the module's ``split(args)`` through
      ``_record_result`` and writes ``self._result`` under ``stage_defs``.
      Nothing else is written for this run type.
    * ``main``: calls the module's ``main(args, outs)``.
    * ``join``: calls the module's ``join(args, outs, chunk_defs,
      chunk_outs)`` with every chunk def / chunk out wrapped in a
      ``martian.Record``.
    * anything else: reported through ``martian.throw``.

    For the non-split run types, ``outs.items()`` is written back under
    ``outs`` afterwards.
    """
    stage_args = martian.Record(self.metadata.read('args'))

    # Split has its own output (stage_defs) and never touches outs.
    if self._run_type == 'split':
        def invoke_split():
            return self._module.split(stage_args)

        def record_split():
            return self._record_result(invoke_split)

        self._run(record_split)
        self.metadata.write('stage_defs', self._result)
        return

    stage_outs = martian.Record(self.metadata.read('outs'))

    if self._run_type == 'main':
        def invoke_main():
            return self._module.main(stage_args, stage_outs)

        self._run(invoke_main)
    elif self._run_type == 'join':
        chunk_defs = [
            martian.Record(raw_def)
            for raw_def in self.metadata.read('chunk_defs')
        ]
        chunk_outs = [
            martian.Record(raw_out)
            for raw_out in self.metadata.read('chunk_outs')
        ]

        def invoke_join():
            return self._module.join(stage_args, stage_outs,
                                     chunk_defs, chunk_outs)

        self._run(invoke_join)
    else:
        martian.throw('Invalid run type %s' % self._run_type)

    # Write the outs back to the metadata (as JSON, per the original
    # comment on this line).
    self.metadata.write('outs', stage_outs.items())
def test_dedup(self):
    """Mark duplicates on the small BAM fixture and check every flag.

    Runs ``main_mark_duplicates`` on ``IN_BAM`` and compares the
    ``is_duplicate`` flag of each record in the output BAM, in file order,
    with the expected pattern.
    """
    # The override below is module-global state: put the previous value
    # back when the test ends so it cannot leak into other tests.
    self.addCleanup(setattr, tenkit.constants,
                    'DUPLICATE_SUBSAMPLE_COVERAGES',
                    tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES)
    tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.00001, 0.0001]

    args = martian.Record({
        'input': IN_BAM,
        'estimated_coverage': 100.0,
        'perfect_read_count': 1000,
        'chunk_start': None,
        'chunk_end': None
    })
    outs = martian.Record({
        'output': OUT_BAM,
        'duplicate_summary': OUT_JSON
    })
    main_mark_duplicates(args, outs)

    out_bam = pysam.Samfile(OUT_BAM)
    # Close the BAM handle when the test finishes, pass or fail.
    self.addCleanup(out_bam.close)
    dups = [read.is_duplicate for read in out_bam]
    self.assertEqual(dups, [False, True, False, False, True, False])
def test_attach_bcs(self):
    """Attach barcodes to the BAM fixture and validate the tags written.

    After running ``main`` this checks that:

    * every output read carries ``RAW_BARCODE_TAG`` and
      ``SAMPLE_INDEX_TAG``, and also ``PROCESSED_BARCODE_TAG`` whenever its
      raw barcode is in the whitelist;
    * the output BAM has as many records as the input BAM;
    * consecutive reads sharing a query name share a single barcode.
    """
    # --align_input alignment_output.bam --barcode_input phix_I2.fastq --output test2.out --complete ~/c --stats ~/s
    args = martian.Record({
        'barcode_whitelist': IN_WHITELIST,
        'align_chunk': IN_BAM,
        'barcode_chunk': IN_I2,
        'sample_index_chunk': IN_I1,
        'gem_group': None,
        'paired_end': True,
        'exclude_non_bc_reads': False,
        'max_expected_bc_error': 0.75,
        'subsample_rate': 1.0,
    })
    outs = martian.Record({'output': OUT_BAM})
    main(args, outs)

    # Get the barcodes
    barcode_whitelist = tk_seq.load_barcode_whitelist(IN_WHITELIST)

    # Read the output once and reuse the records for every check below.
    out_bam = pysam.Samfile(OUT_BAM)
    self.addCleanup(out_bam.close)
    out_reads = list(out_bam)

    # Ensure each read has a barcode
    for read in out_reads:
        tags = dict(read.tags)
        self.assertIn(RAW_BARCODE_TAG, tags)
        if tags[RAW_BARCODE_TAG] in barcode_whitelist:
            self.assertIn(PROCESSED_BARCODE_TAG, tags)
        # NOTE(review): the sample-index check is applied to every read,
        # not only to whitelisted ones - confirm this matches the intent.
        self.assertIn(SAMPLE_INDEX_TAG, tags)

    # Make sure we put out the full BAM file
    in_bam = pysam.Samfile(IN_BAM)
    self.addCleanup(in_bam.close)
    in_len = sum(1 for _ in in_bam)
    self.assertEqual(len(out_reads), in_len)

    # Ensure each read pair has the same barcode. groupby only merges
    # adjacent records, so this relies on mates being consecutive.
    for _, mates in groupby(out_reads, lambda read: read.qname):
        barcodes = set(tk_io.get_read_barcode(mate) for mate in mates)
        self.assertEqual(len(barcodes), 1)
def test_make_unaligned(self):
    """Build an unaligned BAM from the FASTQ fixture and count its reads.

    Runs ``main`` with ``output_format`` ``bam`` on ``IN_FASTQ`` and
    expects exactly 2000 records in the resulting file.
    """
    args = martian.Record({
        'sample_id': 1234,
        'output_format': "bam",
        'read_group': "RG",
        'read_chunk': IN_FASTQ
    })
    outs = martian.Record({'barcoded_unaligned': OUT_BAM})
    main(args, outs)

    # check_sq=False is passed through from the original test (the output
    # is unaligned).
    out_bam = pysam.Samfile(OUT_BAM, check_sq=False)
    self.addCleanup(out_bam.close)
    reads = list(out_bam)

    # A unittest assertion instead of a bare ``assert``: it is not
    # stripped under ``python -O`` and reports the actual count.
    self.assertEqual(len(reads), 2000)
def test_setup_chunks(self):
    """Run ``main`` in BCL_PROCESSOR mode and check the chunk count.

    Uses one sample definition rooted at ``IN_PREFIX`` with two sample
    indices and expects ``main`` to populate ``outs.chunks`` with three
    entries.
    """
    args = martian.Record({
        'input_mode': 'BCL_PROCESSOR',
        'sample_def': [{
            'read_path': IN_PREFIX,
            'sample_indices': ["AAAA", "CCCC"],
            'lanes': None,
            'gem_group': None,
        }],
        'barcode_whitelist': "737K-april-2014",
    })
    outs = martian.Record({})
    main(args, outs)

    # Single-argument parenthesised print behaves the same as the old
    # print statement and is also valid on Python 3.
    print(outs)
    self.assertEqual(len(outs.chunks), 3)
def test_attach_phasing(self):
    """Attach phasing to the BAM fixture and check the ``HP`` tags.

    Runs ``main`` over ``IN_BAM`` with the fragment phasing in
    ``IN_FRAGS``, then checks that reads of selected barcodes positioned
    inside a given range carry the expected ``HP`` tag, and that the
    output BAM is not empty.
    """
    args = martian.Record({
        'input': IN_BAM,
        'fragment_phasing': IN_FRAGS,
        'chunk_start': 0,
        'chunk_end': 1 << 32
    })
    outs = martian.Record({
        'phased_possorted_bam': OUT_BAM,
        'phased_possorted_bam_index': OUT_BAM + ".bai"
    })
    main(args, outs)

    out_bam = pysam.Samfile(OUT_BAM)
    self.addCleanup(out_bam.close)
    bam_reads = list(out_bam)

    # Reference rows kept from the original test (presumably taken from
    # the fragment phasing fixture - confirm against IN_FRAGS):
    #
    # chr1 628490 701466 10565419 565419 711255 GTACACAGAGTGTT-1 0.9996837673 0.000316232700235 5.00029071137e-61
    # chr1 628789 678258 10565419 565419 711255 CGAACTCACTCCAA-1 0.999800468729 0.000199531270515 5.00029083957e-61
    # chr1 628958 726129 10565419 565419 711255 AGGCTTCATCAGAA-1 3.01287901923e-08 0.999999969871 2.50113486814e-61
    # chr1 630911 726153 10565419 565419 711255 CTAAGCAGGTTTAG-1 0.998004731897 0.00199526810283 5.00029096742e-61

    def check_reads(bc, start, end, haplotype):
        """Assert that reads of ``bc`` with start <= pos < end have HP == haplotype."""
        for r in bam_reads:
            if tk_io.get_read_barcode(r) != bc:
                continue

            tags = dict(r.tags)
            if start <= r.pos < end:
                self.assertIn('HP', tags)
                # Two-argument form: the previous single-argument call
                # raised TypeError instead of comparing the values.
                self.assertEqual(tags['HP'], haplotype)

    check_reads("AGGCTTCATCAGAA-1", 565419, 711255, 1)
    # NOTE(review): the next two calls expect haplotype 0 and then 1 for
    # the same barcode and range. Both can only pass when no read of this
    # barcode falls inside the range - confirm which value is intended.
    check_reads("CTAAGCAGGTTTAG-1", 565419, 711255, 0)
    check_reads("CTAAGCAGGTTTAG-1", 565419, 711255, 1)

    self.assertTrue(len(bam_reads) > 0)
def test_ilmn_bclfastq_mode(self):
    """Run ``main`` in ILMN_BCL2FASTQ mode on two sample directories.

    Uses the ``t2`` fixture (samples ``aa`` and ``bb``) and expects one
    barcode (R2) file and one sample-index (I1) file per lane, lanes 1
    to 8, listed for ``aa`` first and ``bb`` second.

    The fixture lives under ``/mnt/projects``; when it is not present the
    test is reported as skipped instead of silently passing.
    """
    path_a = "/mnt/projects/pat/bcl_direct/t2/p1/a"
    path_b = "/mnt/projects/pat/bcl_direct/t2/p2/b"
    if not os.path.exists(path_a):
        self.skipTest("fixture directory not available: %s" % path_a)

    def lane_files(read_path, sample_prefix, read_type):
        """Return the expected file names for lanes 1-8 of one sample."""
        return [
            "%s/%s_L%03d_%s_001.fastq.gz"
            % (read_path, sample_prefix, lane, read_type)
            for lane in range(1, 9)
        ]

    args = martian.Record({
        'input_mode': "ILMN_BCL2FASTQ",
        'sample_def': [{
            "gem_group": None,
            "lanes": None,
            "read_path": path_a,
            "samples": ["aa"]
        }, {
            "gem_group": None,
            "lanes": None,
            "read_path": path_b,
            "samples": ["bb"]
        }],
        'barcode_whitelist': "737K-april-2014",
    })
    outs = martian.Record({})
    main(args, outs)

    self.assertEqual(
        outs.barcodes,
        lane_files(path_a, "aa_S1", "R2") + lane_files(path_b, "bb_S2", "R2"))
    self.assertEqual(outs.barcodes_reverse_complement[0], True)
    self.assertEqual(outs.reads_interleaved, False)
    self.assertEqual(
        outs.sample_indices,
        lane_files(path_a, "aa_S1", "I1") + lane_files(path_b, "bb_S2", "I1"))
    self.assertEqual(outs.is_read1[0], True)
    self.assertEqual(outs.is_read1[1], False)
def test_big_dedup(self):
    """Cross-check duplicate marking on the large BAM fixture.

    Runs ``main_mark_duplicates`` on ``IN_BAM_BIG`` and then:

    * checks that no reads were dropped;
    * re-derives duplicate flags in the test itself (first read of each
      identical key kept, the rest marked) and compares them with the
      flags in the output BAM for reads whose mate is also mapped;
    * recomputes the duplication rate from the histograms in the JSON
      summary and compares it with the rate derived in the test, for all
      barcoded mapped reads and for the stringently filtered reads.
    """
    # The override below is module-global state: put the previous value
    # back when the test ends so it cannot leak into other tests.
    self.addCleanup(setattr, tenkit.constants,
                    'DUPLICATE_SUBSAMPLE_COVERAGES',
                    tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES)
    tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.000003, 0.000015]

    args = martian.Record({
        'input': IN_BAM_BIG,
        'estimated_coverage': 100.0,
        'perfect_read_count': 100000,
        'chunk_start': None,
        'chunk_end': None
    })
    outs = martian.Record({
        'output': OUT_BAM,
        'duplicate_summary': OUT_JSON
    })
    main_mark_duplicates(args, outs)

    out_bam = pysam.Samfile(OUT_BAM)
    self.addCleanup(out_bam.close)
    out_reads = list(out_bam)

    in_bam = pysam.Samfile(IN_BAM_BIG)
    self.addCleanup(in_bam.close)
    in_reads = list(in_bam)

    # Check we haven't lost any reads
    self.assertEqual(len(out_reads), len(in_reads))

    def read_tuple(r):
        """Key under which two reads count as duplicates of each other."""
        bc = crdna_io.get_read_barcode(r)
        return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)

    def mark_duplicates(read_set):
        """Re-run the dup analysis manually, setting is_duplicate in place."""
        read_tups = [(read_tuple(r), r) for r in read_set]
        read_tups.sort(key=lambda x: x[0])
        for _key, group in itertools.groupby(read_tups, lambda x: x[0]):
            members = [pair[1] for pair in group]
            # The first read of each group is the original, the rest are
            # its duplicates.
            members[0].is_duplicate = False
            for dup in members[1:]:
                dup.is_duplicate = True

    def both_mapped(r):
        """True when the read and its mate are both mapped."""
        return (not r.is_unmapped) and (not r.mate_is_unmapped)

    with open(OUT_JSON) as summary_file:
        summary = json.load(summary_file)

    def summary_dup_rate(hist_key):
        """Duplication rate implied by one histogram of the JSON summary.

        The histogram is iterated as ``{times_observed: n}``; each entry
        contributes ``(times_observed - 1) * n`` duplicates out of
        ``times_observed * n`` reads.
        """
        count_hist = summary[hist_key]
        dups = sum((int(times_observed) - 1) * n
                   for (times_observed, n) in count_hist.items())
        total_reads = sum(int(times_observed) * n
                          for (times_observed, n) in count_hist.items())
        return float(dups) / total_reads

    mark_duplicates(in_reads)

    # Make sure our 'all-reads' analysis matches the code
    out_dup_marks = np.array(
        [r.is_duplicate for r in out_reads if both_mapped(r)])
    test_dup_marks = np.array(
        [r.is_duplicate for r in in_reads if both_mapped(r)])

    # test_dup_marks comes from the input BAM and out_dup_marks from the
    # output BAM; the arguments follow the labels in that order.
    print("len(start_bam): %d -- len(out_bam): %d"
          % (len(test_dup_marks), len(out_dup_marks)))
    print("mean dups code: %f" % out_dup_marks.mean())
    print("mean dups test: %f" % test_dup_marks.mean())

    # Compare lengths first so that a shape mismatch fails with a clear
    # message rather than inside the element-wise comparison.
    self.assertEqual(len(out_dup_marks), len(test_dup_marks))
    self.assertTrue((out_dup_marks == test_dup_marks).all())

    # Read the molecule count histogram and verify
    mapped_in_reads = np.array([
        r.is_duplicate for r in in_reads
        if both_mapped(r) and crdna_io.get_read_barcode(r) is not None
    ])
    self.assertEqual(summary_dup_rate('no_filter_full_use_bcs'),
                     mapped_in_reads.mean())

    # Get the perfect reads, mark dups and compare stats. This re-marks
    # the same read objects held in in_reads, so it has to run after the
    # all-reads checks above.
    perfect_reads = [
        x for x in in_reads
        if crdna.read_filter.stringent_read_filter(x, True)
    ]
    mark_duplicates(perfect_reads)

    # Read the molecule count histogram and verify -- perfect reads
    perfect_dup_marks = np.array([r.is_duplicate for r in perfect_reads])
    self.assertEqual(summary_dup_rate('full_use_bcs'),
                     perfect_dup_marks.mean())