def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    # Checks PileupCollection.match_summary against the expected
    # (allele, value) pairs for variants with single-base, multi-base and
    # empty alt alleles at loci on contig "20" of
    # gatk_mini_bundle_extract.bam.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        loci = [
            Locus.from_inclusive_coordinates("20", 10008951),  # 0
            Locus.from_inclusive_coordinates("20", 10009053),  # 1
            Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
            Locus.from_inclusive_coordinates("20", 10006822),  # 3
            Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
        ]
        evidence = PileupCollection.from_bam(handle, loci)

        eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
            [('A', 1), ('C', 4)])
        eq_(
            evidence.filter(drop_duplicates=True).match_summary(
                Variant(loci[0], "A", "C")),
            [('A', 0), ('C', 3)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
            [('A', 3), ('C', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
            [('A', 3), ('CC', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "")),
            [('A', 3), ('', 0)])
        # NOTE(review): identical to the assertion directly above; kept so
        # the test performs exactly the same checks as before.
        eq_(evidence.match_summary(Variant(loci[1], "A", "")),
            [('A', 3), ('', 0)])
        eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
            [('AT', 3), ('', 0)])
        eq_(evidence.match_summary(Variant(loci[3], "A", "")),
            [('A', 2), ('', 6)])
        eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
            [('AC', 2), ('', 6)])
        # With a callable as second argument the summary carries the
        # callable's result per allele (here the mean mapping quality)
        # instead of a count.
        eq_(
            evidence.match_summary(
                Variant(loci[4], "AC", ""),
                lambda e: e.read_attributes().mapping_quality.mean()),
            [('AC', 60.0), ('', 65.0)])
    finally:
        # Release the BAM handle even when an assertion above fails.
        handle.close()
def test_read_evidence_variant_matching_gatk_mini_bundle_extract_warning():
    # The locus spans two bases (10009053-10009054, inclusive) while the
    # variant's ref allele is the single base "A"; the summary is expected
    # to report zero support for both ref and alt and list the observed
    # "AT" allele separately.
    bam_path = data_path("gatk_mini_bundle_extract.bam")

    # Should log a warning but pass.
    two_base_locus = Locus.from_inclusive_coordinates(
        "20", 10009053, 10009054)
    pileups = PileupCollection.from_bam(bam_path, [two_base_locus])

    variant = Variant(two_base_locus, "A", "")
    observed = pileups.match_summary(variant)
    eq_(observed, [('A', 0), ('', 0), ('AT', 3)])
def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant():
    # Try native varcode Variant: the VarcodeVariant is passed to
    # PileupCollection.from_bam in place of a Locus and then to
    # match_summary.
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        locus = Locus.from_inclusive_coordinates("20", 10008951)
        variant = VarcodeVariant(
            locus.contig,
            locus.position + 1,  # inclusive not interbase
            "A",
            "C")
        evidence = PileupCollection.from_bam(handle, [variant])
        eq_(evidence.match_summary(variant), [('A', 1), ('C', 4)])
    finally:
        # Release the BAM handle even when the assertion fails.
        handle.close()
def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    # Verifies PileupCollection.match_summary for a set of variants on
    # contig "20" of gatk_mini_bundle_extract.bam, including a
    # duplicate-filtered collection and a custom per-allele score function.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        loci = [
            Locus.from_inclusive_coordinates("20", 10008951),  # 0
            Locus.from_inclusive_coordinates("20", 10009053),  # 1
            Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
            Locus.from_inclusive_coordinates("20", 10006822),  # 3
            Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
        ]
        evidence = PileupCollection.from_bam(handle, loci)

        eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
            [('A', 1), ('C', 4)])
        eq_(evidence.filter(drop_duplicates=True).match_summary(
                Variant(loci[0], "A", "C")),
            [('A', 0), ('C', 3)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
            [('A', 3), ('C', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
            [('A', 3), ('CC', 0)])
        eq_(evidence.match_summary(Variant(loci[1], "A", "")),
            [('A', 3), ('', 0)])
        # NOTE(review): verbatim repeat of the previous assertion; retained
        # so the set of checks is unchanged.
        eq_(evidence.match_summary(Variant(loci[1], "A", "")),
            [('A', 3), ('', 0)])
        eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
            [('AT', 3), ('', 0)])
        eq_(evidence.match_summary(Variant(loci[3], "A", "")),
            [('A', 2), ('', 6)])
        eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
            [('AC', 2), ('', 6)])
        # The callable's return value (mean mapping quality) is what the
        # summary reports per allele in this call.
        eq_(evidence.match_summary(
                Variant(loci[4], "AC", ""),
                lambda e: e.read_attributes().mapping_quality.mean()),
            [('AC', 60.0), ('', 65.0)])
    finally:
        # Always close the BAM handle, including on assertion failure.
        handle.close()
def test_read_evidence_rna1_multi_base_loci():
    # Table-driven check of PileupCollection.allele_summary for multi-base
    # loci on contig "17" of rna_chr17_41244936.bam. Each row is
    # (first position, last position, expected summary), with positions
    # given as inclusive coordinates.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    cases = [
        (41244853, 41244854, [("CT", 1)]),  # 0
        (41244853, 41244857, [("CTTTT", 1)]),  # 1
        (41244854, 41244857, [("TTTT", 1)]),  # 2
        (41244852, 41244857, []),  # 3
        (41244933, 41244936,
         [("AACA", 11), ("AACG", 6), ("GACG", 1)]),  # 4
        (41244933, 41244937,
         [("AACAG", 10), ("AACGG", 6), ("AACAA", 1), ("GACGG", 1)]),  # 5
        (41244971, 41244973, [("ATT", 11)]),  # 6
        (41265063, 41265067, [("ACCCG", 1)]),  # 7
    ]
    loci = [
        Locus.from_inclusive_coordinates("17", first, last)
        for first, last, _ in cases
    ]
    evidence = PileupCollection.from_bam(
        data_path("rna_chr17_41244936.bam"), loci)

    for locus, (_, _, expected) in zip(loci, cases):
        eq_(evidence.allele_summary(locus), expected)
def test_read_evidence_rna1_multi_base_loci():
    # Asserts the allele_summary reported for each multi-base locus on
    # contig "17" of rna_chr17_41244936.bam. Expected summaries are listed
    # in the same order as the inclusive coordinate ranges.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    inclusive_ranges = [
        (41244853, 41244854),  # 0
        (41244853, 41244857),  # 1
        (41244854, 41244857),  # 2
        (41244852, 41244857),  # 3
        (41244933, 41244936),  # 4
        (41244933, 41244937),  # 5
        (41244971, 41244973),  # 6
        (41265063, 41265067),  # 7
    ]
    expected_summaries = [
        [("CT", 1)],
        [("CTTTT", 1)],
        [("TTTT", 1)],
        [],
        [("AACA", 11), ("AACG", 6), ("GACG", 1)],
        [("AACAG", 10), ("AACGG", 6), ("AACAA", 1), ("GACGG", 1)],
        [("ATT", 11)],
        [("ACCCG", 1)],
    ]

    loci = []
    for start, end in inclusive_ranges:
        loci.append(Locus.from_inclusive_coordinates("17", start, end))

    bam_path = data_path("rna_chr17_41244936.bam")
    evidence = PileupCollection.from_bam(bam_path, loci)

    for index, expected in enumerate(expected_summaries):
        eq_(evidence.allele_summary(loci[index]), expected)
def test_read_evidence_rna1_single_base_loci():
    # Table-driven check of PileupCollection.allele_summary at single-base
    # loci on contig "17" of rna_chr17_41244936.bam. Each row pairs an
    # inclusive position (used as both start and end) with the expected
    # summary.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    cases = [
        (41244936, [("A", 11), ("G", 7)]),  # 0
        (41244937, [("G", 17), ("A", 1)]),  # 1
        (41244935, [("C", 18)]),  # 2
        (41244933, [("A", 17), ("G", 1)]),  # 3
        (41244853, [("C", 1)]),  # 4
        (41244857, [("T", 2)]),  # 5
        (41244864, [("T", 4)]),  # 6
        (41244879, [("C", 8)]),  # 7
        (41244901, [("C", 8)]),  # 8
        (41244910, [("C", 9)]),  # 9
        (41244917, [("A", 10)]),  # 10
        (41244972, [("T", 11)]),  # 11
        (41244973, [("T", 11)]),  # 12
        (41245026, [("C", 1)]),  # 13
        (41245027, [("G", 1)]),  # 14
        (41245019, [("T", 8)]),  # 15
        (41245018, [("T", 8)]),  # 16
    ]
    loci = [
        Locus.from_inclusive_coordinates("17", position, position)
        for position, _ in cases
    ]
    evidence = PileupCollection.from_bam(
        data_path("rna_chr17_41244936.bam"), loci)

    for locus, (_, expected) in zip(loci, cases):
        eq_(evidence.allele_summary(locus), expected)
def test_read_evidence_gatk_mini_bundle_extract():
    # Exercises PileupCollection built from gatk_mini_bundle_extract.bam:
    # allele_summary with and without filters (duplicates, base quality,
    # mapping quality, improper mate pairs), per-locus access via at(),
    # read_attribute / read_attributes, and the KeyError raised for a locus
    # that was not passed to from_bam.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    loci = [
        Locus.from_inclusive_coordinates("20", 9999996, 9999996),  # 0
        Locus.from_inclusive_coordinates("20", 10260442),  # 1
        Locus.from_inclusive_coordinates("20", 10006823),  # 2
        Locus.from_inclusive_coordinates("20", 10006819, 10006823),  # 3
        Locus.from_inclusive_coordinates("20", 10006819, 10006825),  # 4
        Locus.from_inclusive_coordinates("20", 10006822, 10006827),  # 5
        Locus.from_inclusive_coordinates("20", 10007175),  # 6
        Locus.from_inclusive_coordinates("20", 10007174, 10007176),  # 7
        Locus.from_inclusive_coordinates("20", 1, 3),  # 8
        Locus.from_inclusive_coordinates("20", 10008796),  # 9
        Locus.from_inclusive_coordinates("20", 10008921),  # 10
    ]
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        evidence = PileupCollection.from_bam(handle, loci)

        eq_(evidence.allele_summary(loci[0]), [("ACT", 9)])
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[0]),
            [("ACT", 8)])
        eq_(evidence.allele_summary(loci[1]), [("T", 7)])
        eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_base_quality=50).allele_summary(loci[2]),
            [])
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=60).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[2]),
            [("", 2)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[3]),
            [("A", 2)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[4]),
            [("AAA", 2)])
        eq_(
            evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[5]),
            [("AAAC", 2)])
        eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)])
        eq_(
            evidence.filter(min_base_quality=30).allele_summary(loci[6]),
            [("T", 4), ("C", 3)])
        eq_(evidence.filter().allele_summary(loci[7]),
            [("CTT", 5), ("CCT", 3)])
        eq_(
            evidence.filter(min_base_quality=30).allele_summary(loci[7]),
            [("CTT", 3), ("CCT", 2)])
        eq_(
            evidence.filter(min_base_quality=32).allele_summary(loci[2]),
            [("", 6), ("C", 1)])
        eq_(
            filtered_read_names(
                evidence.at(loci[2]).filter(min_base_quality=32)),
            {'20GAVAAXX100126:4:3:18352:43857'})

        # Locus 8 (positions 1-3) yields an empty summary, filtered or not.
        eq_(evidence.allele_summary(loci[8]), [])
        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]),
            [])

        # This locus is not among those given to from_bam.
        assert_raises(
            KeyError,
            evidence.allele_summary,
            Locus.from_inclusive_coordinates("20", 10009174, 10009176))

        eq_(
            filtered_read_names(
                evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)),
            {'20FUKAAXX100202:8:68:1530:49310'})
        eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0)
        # read_attribute(name) must agree with the same-named column of
        # read_attributes().
        eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')),
            list(evidence.at(loci[9]).read_attributes().mapping_quality))

        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(loci[10]),
            [('C', 2), ('CA', 1), ('CAA', 1)])
        # Same position queried as a zero-width interbase locus
        # (start == end).
        eq_(
            evidence.filter(drop_duplicates=True).allele_summary(
                Locus.from_interbase_coordinates(
                    loci[10].contig, loci[10].start, loci[10].start)),
            [('', 2), ('A', 1), ('AA', 1)])
    finally:
        # Release the BAM handle even when an assertion above fails.
        handle.close()
def test_read_evidence_rna1_single_base_loci():
    # Asserts the allele_summary reported at each single-base locus on
    # contig "17" of rna_chr17_41244936.bam. Positions are inclusive
    # coordinates, each used as both the start and the end of its locus.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    positions = [
        41244936,  # 0
        41244937,  # 1
        41244935,  # 2
        41244933,  # 3
        41244853,  # 4
        41244857,  # 5
        41244864,  # 6
        41244879,  # 7
        41244901,  # 8
        41244910,  # 9
        41244917,  # 10
        41244972,  # 11
        41244973,  # 12
        41245026,  # 13
        41245027,  # 14
        41245019,  # 15
        41245018,  # 16
    ]
    expected_summaries = [
        [("A", 11), ("G", 7)],
        [("G", 17), ("A", 1)],
        [("C", 18)],
        [("A", 17), ("G", 1)],
        [("C", 1)],
        [("T", 2)],
        [("T", 4)],
        [("C", 8)],
        [("C", 8)],
        [("C", 9)],
        [("A", 10)],
        [("T", 11)],
        [("T", 11)],
        [("C", 1)],
        [("G", 1)],
        [("T", 8)],
        [("T", 8)],
    ]

    loci = []
    for position in positions:
        loci.append(
            Locus.from_inclusive_coordinates("17", position, position))

    bam_path = data_path("rna_chr17_41244936.bam")
    evidence = PileupCollection.from_bam(bam_path, loci)

    for index, expected in enumerate(expected_summaries):
        eq_(evidence.allele_summary(loci[index]), expected)
def test_read_evidence_gatk_mini_bundle_extract():
    # Checks allele summaries, read filters and read attributes of a
    # PileupCollection loaded from gatk_mini_bundle_extract.bam, plus the
    # KeyError raised when summarising a locus that was never loaded.
    #
    # NOTE(review): this test name is defined more than once in the file as
    # seen here; Python keeps only the last definition, so confirm which
    # copy is intended.
    loci = [
        Locus.from_inclusive_coordinates("20", 9999996, 9999996),  # 0
        Locus.from_inclusive_coordinates("20", 10260442),  # 1
        Locus.from_inclusive_coordinates("20", 10006823),  # 2
        Locus.from_inclusive_coordinates("20", 10006819, 10006823),  # 3
        Locus.from_inclusive_coordinates("20", 10006819, 10006825),  # 4
        Locus.from_inclusive_coordinates("20", 10006822, 10006827),  # 5
        Locus.from_inclusive_coordinates("20", 10007175),  # 6
        Locus.from_inclusive_coordinates("20", 10007174, 10007176),  # 7
        Locus.from_inclusive_coordinates("20", 1, 3),  # 8
        Locus.from_inclusive_coordinates("20", 10008796),  # 9
        Locus.from_inclusive_coordinates("20", 10008921),  # 10
    ]
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))
    try:
        evidence = PileupCollection.from_bam(handle, loci)

        eq_(evidence.allele_summary(loci[0]), [("ACT", 9)])
        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[0]),
            [("ACT", 8)])
        eq_(evidence.allele_summary(loci[1]), [("T", 7)])
        eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_base_quality=50).allele_summary(loci[2]),
            [])
        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=60).allele_summary(loci[2]),
            [("", 5), ("C", 1)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[2]),
            [("", 2)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[3]),
            [("A", 2)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[4]),
            [("AAA", 2)])
        eq_(evidence.filter(
                drop_duplicates=True,
                min_mapping_quality=61).allele_summary(loci[5]),
            [("AAAC", 2)])
        eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)])
        eq_(evidence.filter(min_base_quality=30).allele_summary(loci[6]),
            [("T", 4), ("C", 3)])
        eq_(evidence.filter().allele_summary(loci[7]),
            [("CTT", 5), ("CCT", 3)])
        eq_(evidence.filter(min_base_quality=30).allele_summary(loci[7]),
            [("CTT", 3), ("CCT", 2)])
        eq_(evidence.filter(min_base_quality=32).allele_summary(loci[2]),
            [("", 6), ("C", 1)])
        eq_(filtered_read_names(
                evidence.at(loci[2]).filter(min_base_quality=32)),
            {'20GAVAAXX100126:4:3:18352:43857'})

        # Locus 8 (positions 1-3) yields an empty summary, filtered or not.
        eq_(evidence.allele_summary(loci[8]), [])
        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]),
            [])

        # This locus is not among those given to from_bam.
        assert_raises(
            KeyError,
            evidence.allele_summary,
            Locus.from_inclusive_coordinates("20", 10009174, 10009176))

        eq_(filtered_read_names(
                evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)),
            {'20FUKAAXX100202:8:68:1530:49310'})
        eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0)
        # read_attribute(name) must agree with the same-named column of
        # read_attributes().
        eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')),
            list(evidence.at(loci[9]).read_attributes().mapping_quality))

        eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[10]),
            [('C', 2), ('CA', 1), ('CAA', 1)])
        # Same position queried as a zero-width interbase locus
        # (start == end).
        eq_(evidence.filter(drop_duplicates=True).allele_summary(
                Locus.from_interbase_coordinates(
                    loci[10].contig, loci[10].start, loci[10].start)),
            [('', 2), ('A', 1), ('AA', 1)])
    finally:
        # Always close the BAM handle, including on assertion failure.
        handle.close()