Example #1
 def test_duplicate_pairs_no_discard(self):
     # Two identical pairs.  With discard_duplicates = False both pairs
     # are emitted, but one of them is flagged as a duplicate.
     p = test_utils.pair1()
     # use the first read to create the map-reduce key
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))  # add it twice
     self.__reducer.discard_duplicates = False
     self.__reducer.reduce(self.__ctx)
     self.assertEqual(1, len(self.__ctx.emitted.keys()))
     # four SAM records associated with the same key
     self.assertEqual(4, len(self.__ctx.emitted.values()[0]))
     flags = map(lambda sam: int(re.match(r"(\d+)", sam).group(1)),
                 self.__ctx.emitted.values()[0])
     # ensure we have two marked as duplicates
     self.assertEqual(2, len(filter(lambda flag: flag & sam_flags.SAM_FDP, flags)))
     # ensure we have two NOT marked as duplicates
     self.assertEqual(2, len(filter(lambda flag: flag & sam_flags.SAM_FDP == 0, flags)))
     # check counter
     if self.__ctx.counters.has_key(self.__frag_counter_name()):
         self.assertEqual(0, self.__ctx.counters[self.__frag_counter_name()])
     self.assertTrue(self.__ctx.counters.has_key(self.__pair_counter_name()))
     self.assertEqual(1, self.__ctx.counters[self.__pair_counter_name()])
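Note: every test in this listing drives the reducer through a fake map-reduce context supplied by the project's test utilities. The sketch below is a hypothetical reconstruction, inferred only from the calls the tests make (`add_value`, `emitted`, `counters`), of the minimal interface such a context needs; the real helper in `test_utils` may differ.

    # Hypothetical minimal fake context, reconstructed from the interface
    # the tests in this listing exercise; the project's real helper may differ.
    class FakeReduceContext(object):
        def __init__(self):
            self.input = {}      # key -> list of serialized values for reduce()
            self.emitted = {}    # key -> list of emitted SAM record strings
            self.counters = {}   # counter name -> integer count

        def add_value(self, key, value):
            # queue a (key, value) pair as input for the reducer under test
            self.input.setdefault(key, []).append(value)

        def emit(self, key, value):
            # record an emitted record so the test can assert on it
            self.emitted.setdefault(key, []).append(value)

        def increment_counter(self, name, amount=1):
            self.counters[name] = self.counters.get(name, 0) + amount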
Example #2
    def test_duplicate_fragments_read1_no_discard(self):
        # load pair 1 and erase its second read
        p = list(test_utils.pair1())
        p = test_utils.erase_read2(p)
        p0 = p[0]
        # insert the pair into the context, twice
        self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
        self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
        self.__reducer.discard_duplicates = False
        self.__reducer.reduce(self.__ctx)
        self.assertEqual(1, len(self.__ctx.emitted.keys()))
        # two SAM records associated with the key
        self.assertEqual(2, len(self.__ctx.emitted.values()[0]))
        short_name = p0.get_name()[0:-2]
        self.assertEqual(short_name, self.__ctx.emitted.keys()[0])
        flags = map(lambda sam: int(re.match(r"(\d+)", sam).group(1)),
                    self.__ctx.emitted.values()[0])
        # ensure we have one marked as a duplicate
        self.assertEqual(1, len(filter(lambda flag: flag & sam_flags.SAM_FDP, flags)))
        # and ensure we have one NOT marked as a duplicate
        self.assertEqual(1, len(filter(lambda flag: flag & sam_flags.SAM_FDP == 0, flags)))

        # check counter
        self.assertFalse(self.__ctx.counters.has_key(self.__pair_counter_name()))
        self.assertTrue(self.__ctx.counters.has_key(self.__frag_counter_name()))
        self.assertEqual(1, self.__ctx.counters[self.__frag_counter_name()])
Example #3
    def test_rmdup_bug(self):
        test_case_data = [
            "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t83\t20\t6181935\t60\t5S96M\t=\t6181919\t-112\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96\tXT:A:U\tNM:i:1\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:1\tXO:i:0\tXG:i:0\tMD:Z:13G82\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:######@@@>4)(:5@>>>::5(C;;,>?@;@72B;)?:A=A>EIIIIHHEHA@?)HHDCAFFHG?9GDGGHC<FA3FAEGGGHGIHE<FBFBDADAA?<?",
            "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181935\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@@@FFFDDFHG??;EEH>HHGIGHEGCGEGGIGJG31?DDBBD>FGG@HG??DFBBADFAGII3@EH;;CEHECBB7?>CE.;...5>ACDDA:C:;>:>?",
            "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t83\t20\t6181938\t60\t8S93M\t=\t6181919\t-112\tAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACCCCTCTACTGTTACCAAATCCACTCAGATTCCCAAATCTTCCACTT\t@@@DDFDFHHHHHJJJEHGGHIHHAEGHJJIJJFGGHGIDIGIJJ?BBGGGIIIJJIJGFHGIJEC(=3?C;?B9?@C>CECECAA(;;3>C#########\tXC:i:93\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:10G36C45\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:#########C>3;;(AACECEC>C@?9B?;C?3=(CEJIGHFGJIJJIIIGGGBB?JJIGIDIGHGGFJJIJJHGEAHHIHGGHEJJJHHHHHFDFDD@@@",
            "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t163\t20\t6181919\t60\t101M\t=\t6181938\t112\tCTGAGCACACCAAAATTCATCTACCACCTATATTGGTACCTCGATTGTTTTCCAAGCCAGATCCACACCTCTACTGTTACCAAATCCACTCAGATTCCCAA\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA\tXT:A:U\tNM:i:2\tSM:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tAM:i:37\tX0:i:1\tX1:i:0\tXM:i:2\tXO:i:0\tXG:i:0\tMD:Z:29G36C34\tRG:Z:raw_merged-1.2.3.4.5.6.7.8.bam\tOQ:Z:@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
        ]
        self.__reducer.discard_duplicates = False
        sams = map(SAMMapping, test_case_data)
        left_key = "0020:000006181919:F"
        pairs = ((sams[1], sams[0]), (sams[3], sams[2]))

        # add the pairs to the context
        self.__ctx.add_value(left_key, proto.serialize_pair(pairs[0]))
        self.__ctx.add_value(left_key, proto.serialize_pair(pairs[1]))
        self.__reducer.reduce(self.__ctx)

        self.assertEqual(2, len(self.__ctx.emitted))
        self.assertEqual([2, 2], map(len, self.__ctx.emitted.itervalues()))
        key = "HWI-ST200R_251:6:2207:18561:163438#GCCAAT"
        good_pair_sam = self.__ctx.emitted[key]
        for read in good_pair_sam:
            mapping = SAMMapping("\t".join((key, read)))
            self.assertFalse(mapping.is_duplicate())

        key = "HWI-ST200R_251:5:1208:19924:124635#GCCAAT"
        dup_pair_sam = self.__ctx.emitted[key]
        for read in dup_pair_sam:
            mapping = SAMMapping("\t".join((key, read)))
            self.assertTrue(mapping.is_duplicate())
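The `left_key` literal above ("0020:000006181919:F") shows the key layout: a zero-padded reference id, a zero-padded coordinate, and an orientation letter, so plain string sorting of keys matches genomic order. A minimal sketch of building such a key, assuming the field widths seen in these literals (the project's actual `make_key`/`get_hit_key` may differ):

    def make_sort_key(tid, pos, reverse=False):
        # Zero-pad both fields so lexicographic order equals genomic order.
        # Widths (4 and 12) are inferred from keys like "0020:000006181919:F";
        # this is an illustrative guess, not the project's implementation.
        return "%04d:%012d:%s" % (tid, pos, "R" if reverse else "F")

    assert make_sort_key(20, 6181919) == "0020:000006181919:F"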
Example #4
    def test_fragment_with_duplicate_in_pair_1_no_discard(self):
        # Ensure the reducer catches a fragment duplicate of pair[0]
        p = list(test_utils.pair1())
        self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
        p = test_utils.erase_read2(p)
        self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
        self.__reducer.discard_duplicates = False
        self.__reducer.reduce(self.__ctx)
        # now ensure that both were emitted, but the fragment is marked as duplicate
        self.__ensure_pair1_emitted()
        self.assertEqual(1, len(self.__ctx.emitted.keys()))
        self.assertEqual(3, len(self.__ctx.emitted.values()[0])) # 3 SAM records associated with the key (for the pair)

        # make sure we have a read with the duplicate flag set
        regexp = r"(\d+)\s+.*"
        flags = [int(re.match(regexp, value).group(1))
                 for value in self.__ctx.emitted.values()[0]]
        dup_flags = [flag for flag in flags if flag & sam_flags.SAM_FDP]
        self.assertEqual(1, len(dup_flags))
        f = dup_flags[0]
        self.assertTrue(f & sam_flags.SAM_FR1 > 0)   # ensure the duplicate read is r1
        self.assertTrue(f & sam_flags.SAM_FPD == 0)  # ensure the duplicate read is unpaired

        # check counter
        self.assertFalse(self.__ctx.counters.has_key(self.__pair_counter_name()))
        self.assertTrue(self.__ctx.counters.has_key(self.__frag_counter_name()))
        self.assertEqual(1, self.__ctx.counters[self.__frag_counter_name()])
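The flag constants used above follow the standard SAM bit definitions (assuming the usual samtools naming: `SAM_FPD` = 0x1 read paired, `SAM_FR1` = 0x40 first in pair, `SAM_FDP` = 0x400 PCR/optical duplicate). A quick worked decode of a flag value taken from the test_fw_rev_missed_dup_pair data later in this listing:

    # Standard SAM flag bits, assuming samtools-style naming as in the tests.
    SAM_FPD = 0x1    # read is paired
    SAM_FR1 = 0x40   # first read of the pair
    SAM_FDP = 0x400  # PCR or optical duplicate

    # Flag 1107 decodes as 1024 + 64 + 16 + 2 + 1: a duplicate,
    # first-in-pair, reverse-strand, properly paired read.
    flag = 1107
    assert flag & SAM_FDP and flag & SAM_FR1 and flag & SAM_FPD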
Example #5
    def test_rmdup_clipped_unpaired(self):
        test_case_data = [
            "HWI-ST200R_251:5:1208:19924:124635#GCCAAT\t82\t20\t6181935\t60\t5S96M\t*\t0\t0\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t?<?AADADBFBF<EHIGHGGGEAF3AF<CHGGDG9?GHFFACDHH)?@AHEHHIIIIE>A=A:?);B27@;@?>,;;C(5::>>>@5:()4>@@@######\tXC:i:96",
            "HWI-ST200R_251:6:2207:18561:163438#GCCAAT\t82\t20\t6181930\t60\t101M\t*\t0\t0\tAAGTGGAAGATTTGGGAATCTGAGTGGATTTGGTAACAGTAGAGGGGTGGATCTGGCTTGGAAAACAATCGAGGTACCAATATAGGTGGTAGATGAATTTT\t@CCFFDDFHHHHHIJJJIIJJJIJJIIJGJIIIJII?DGHIGHDGHIIIJIJIJIIDCHGIJIIGGHIFEHHHHFFFFFDC.6.66;@CCCDCCDC>CCCA",
        ]
        self.__reducer.discard_duplicates = False
        sams = map(SAMMapping, test_case_data)
        left_key = "0020:000006181930:R"
        pairs = ((sams[0], None), (sams[1], None))

        # add the pairs to the context
        self.__ctx.add_value(left_key, proto.serialize_pair(pairs[0]))
        self.__ctx.add_value(left_key, proto.serialize_pair(pairs[1]))
        self.__reducer.reduce(self.__ctx)

        self.assertEqual(2, len(self.__ctx.emitted))
        self.assertEqual([1, 1], map(len, self.__ctx.emitted.itervalues()))

        key = "HWI-ST200R_251:6:2207:18561:163438#GCCAAT"
        good_read_sam = self.__ctx.emitted[key][0]
        mapping = SAMMapping("\t".join((key, good_read_sam)))
        self.assertFalse(mapping.is_duplicate())

        key = "HWI-ST200R_251:5:1208:19924:124635#GCCAAT"
        dup_read_sam = self.__ctx.emitted[key][0]
        mapping = SAMMapping("\t".join((key, dup_read_sam)))
        self.assertTrue(mapping.is_duplicate())
Example #6
 def test_duplicate_fragments_read1(self):
     # load pair 1
     p = list(test_utils.pair1())
     p = test_utils.erase_read2(p)
     p0 = p[0]
     # insert the pair into the context, twice
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__reducer.reduce(self.__ctx)
     self.assertEqual(1, len(self.__ctx.emitted.keys()))
     # only one SAM record associated with the key
     self.assertEqual(1, len(self.__ctx.emitted.values()[0]))
     short_name = p0.get_name()[0:-2]
     self.assertEqual(short_name, self.__ctx.emitted.keys()[0])
     self.assertTrue(re.match(r"\d+\s+%s\s+%d\s+.*" % (p0.tid, p0.pos),
                              self.__ctx.emitted[short_name][0]))
     # check counter
     self.assertFalse(self.__ctx.counters.has_key(self.__pair_counter_name()))
     self.assertTrue(self.__ctx.counters.has_key(self.__frag_counter_name()))
     self.assertEqual(1, self.__ctx.counters[self.__frag_counter_name()])
Example #7
 def test_fragment_with_duplicate_in_pair_1(self):
     # Ensure the reducer catches a fragment duplicate of pair[0]
     p = list(test_utils.pair1())
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     test_utils.erase_read2(p)
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__reducer.reduce(self.__ctx)
     # now ensure that the pair was emitted, but not the fragment
     self.__ensure_only_pair1_emitted()
     self.assertEqual(1, len(self.__ctx.emitted.keys()))
     self.assertEqual(2, len(self.__ctx.emitted.values()[0])) # two SAM records associated with the key (for the pair)
     # check counter
     self.assertFalse(self.__ctx.counters.has_key(self.__pair_counter_name()))
     self.assertTrue(self.__ctx.counters.has_key(self.__frag_counter_name()))
     self.assertEqual(1, self.__ctx.counters[self.__frag_counter_name()])
Example #8
 def test_duplicate_pairs(self):
     # Two identical pairs.  Ensure only one is emitted
     p = test_utils.pair1()
     # use the first read to create the map-reduce key
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p)) # add it twice
     self.__reducer.reduce(self.__ctx)
     self.assertEqual(1, len(self.__ctx.emitted.keys()))
     self.assertEqual(2, len(self.__ctx.emitted.values()[0])) # two SAM records associated with the same key
     self.__ensure_only_pair1_emitted()
     # check counter
     if self.__ctx.counters.has_key(self.__frag_counter_name()):
         self.assertEqual(0, self.__ctx.counters[self.__frag_counter_name()])
     self.assertTrue(self.__ctx.counters.has_key(self.__pair_counter_name()))
     self.assertEqual(1, self.__ctx.counters[self.__pair_counter_name()])
Example #9
    def process(self, pair):
        if any(pair):
            # order pair such that left-most read is at pos 0
            ordered_pair = self.__order_pair(pair)

            record = protobuf_mapping.serialize_pair(ordered_pair)
            # emit with the left coord
            key = self.get_hit_key(ordered_pair[0])
            self.ctx.emit(key, record)
            if ordered_pair[0].is_mapped():
                self.event_monitor.count("mapped coordinates", 1)
                # since we ordered the pair, if ordered_pair[0] is unmapped
                # ordered_pair[1] will not be mapped.
                if ordered_pair[1]:
                    if ordered_pair[1].is_mapped():
                        # a full pair. We emit the coordinate, but with PAIR_STRING as the value
                        key = self.get_hit_key(ordered_pair[1])
                        self.ctx.emit(key, seqal_app.PAIR_STRING)
                        self.event_monitor.count("mapped coordinates", 1)
                    else:
                        self.event_monitor.count("unmapped reads", 1)
            else:
                self.event_monitor.count("unmapped reads", len(pair))

        # in all cases, forward the original pair to the link in the chain
        if self.next_link:
            self.next_link.process(pair)
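For context: `process` is one link in a processing chain. It keys the serialized pair on the left-most coordinate, emits the mate's coordinate with `PAIR_STRING` as a placeholder value when both reads are mapped, and always forwards the original pair downstream. A hedged sketch of stub collaborators for driving such a link in isolation (names are illustrative, not the project's API):

    # Illustrative stubs for exercising a chain link like the one above.
    class StubContext(object):
        def __init__(self):
            self.emitted = []
        def emit(self, key, value):
            self.emitted.append((key, value))

    class StubEventMonitor(object):
        def __init__(self):
            self.counts = {}
        def count(self, name, n):
            self.counts[name] = self.counts.get(name, 0) + n

    # Hypothetical wiring: link.ctx = StubContext();
    # link.event_monitor = StubEventMonitor(); link.next_link = None.
    # link.process(pair) then emits once for a lone mapped fragment, twice
    # for a fully mapped pair, and forwards the pair if a next link is set.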
Example #10
 def test_unmapped2(self):
     p = test_utils.pair1()
     p[1].set_mapped(False)
     p[0].set_mate_mapped(False)
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__reducer.reduce(self.__ctx)
     self.assertEqual(2, len(self.__ctx.emitted.values()[0]))
Example #11
 def test_emit_on_left_key(self):
     # load pair 1
     p = test_utils.pair1()
     # use the first read to create the map-reduce key
     self.__ctx.add_value(test_utils.make_key(p[0]), proto.serialize_pair(p))
     self.__reducer.reduce(self.__ctx)
     self.__ensure_only_pair1_emitted()
Example #12
    def test_fw_rev_missed_dup_pair(self):
        """
		Here we have two duplicate pairs, as detected by Picard.
		The reads 10364 have flags 'pPR1', 'pPr2'.
		The reads 138222 have flags 'pPR2d', 'pPr1d'.
		10364/1 has no trimming and is on the fw strand.  It should have a key position 88888399:F.
		10364/2 is also on the rev strand.  We expect a key position of 88888421+53 = 88888474:R.
		138222/1 is on the rev strand.  We expect a key position of 88888404+70 = 88888474:R.
		138222/2 has been trimmed but is on the fw strand. It should have a key position 88888399:F.
		"""
        test_case_data = [
            "HWI-ST332_97:3:66:16214:10364#0\t99\t10\t88888399\t46\t76M\t=\t88888421\t76\tTGATTTTGCTCCATTGTCTTCTAGCTTGTGTTATGCCTGTTGAAAGTACAAAATCATTCTGGAAGCTTATCTATTG\tHGHHHHHHFHEHHHHGHHFHHHGGHHHHHHHHHHEHDHGEHFHHHHHHHGGHHHHHHHHHHBHBBEGG=GFFFF@F",
            "HWI-ST332_97:3:66:16214:10364#0\t147\t10\t88888421\t46\t22S54M\t=\t88888399\t-76\tTGATTCCGCTCCATGTGCCTCGAGCTTGTGTTATGCCTGTTGAAAGTACAAAATCATTCTGGAAGCTTATCTATTG\t#######################AA@EHGHEHHHHHHHHHHHHHHHEHHHHHHFHFHHHHHHHHHHHHHHHGHHFH",
            "HWI-ST332_97:3:7:10556:138222#0\t1107\t10\t88888404\t23\t5S71M\t=\t88888399\t-76\tTGATGTTGCTCCATTGTCTTCTAGCTTGTGTTATGCCTGTTGAAAGTACAAAATCATTCTGGAAGCTTATCTATTG\t######C<BCC:B,DDDC=BD3@CB8B?DBD@E@EEEEECED@CDB=8C7A@D=DEDEDCDBECDE<>;;,17,45",
            "HWI-ST332_97:3:7:10556:138222#0\t1187\t10\t88888399\t15\t50M26S\t=\t88888404\t76\tTGATTTTGCTCCATTGTCTTCTAGCTTGTGTTATGCCTGTTGAAAGTACAAAATCCGTCTGGTTGCTTCTATTTTG\tCC7EEFFF@FHHFHHG?GGF:>4.7GD8DC@D>CCFGG@GGG5GG4<CB###########################",
        ]
        sams = map(SAMMapping, test_case_data)
        pair1 = sams[0:2]
        pair2 = sams[2:]

        left_key = "0010:000088888399:F"
        self.__ctx.add_value(left_key, proto.serialize_pair(pair1))
        self.__ctx.add_value(left_key, proto.serialize_pair(pair2))
        self.__reducer.reduce(self.__ctx)

        # verify emitted data.  2 keys (one per read name) and 2 reads for each key
        self.assertEqual(2, len(self.__ctx.emitted))
        self.assertEqual([2, 2], map(len, self.__ctx.emitted.itervalues()))
        key = "HWI-ST332_97:3:66:16214:10364#0"
        good_pair_sam = self.__ctx.emitted[key]
        for read in good_pair_sam:
            mapping = SAMMapping("\t".join((key, read)))
            self.assertFalse(mapping.is_duplicate())
        key = "HWI-ST332_97:3:7:10556:138222#0"
        dup_pair_sam = self.__ctx.emitted[key]
        for read in dup_pair_sam:
            mapping = SAMMapping("\t".join((key, read)))
            self.assertTrue(mapping.is_duplicate())

        self.setUp()  # clean-up and repeat for the right key
        right_key = "0010:000088888474:R"
        self.__ctx.add_value(right_key, PAIR_STRING)
        self.__ctx.add_value(right_key, PAIR_STRING)
        self.__reducer.reduce(self.__ctx)
        # verify no data emitted
        self.assertEqual(0, len(self.__ctx.emitted))
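The key positions in the docstring are unclipped 5' ends: a forward read keys on its start minus any leading soft clip, and a reverse read on its alignment end plus any trailing soft clip, which is how the two pairs collide on 88888474:R. A simplified reconstruction of that arithmetic (not the project's code):

    import re

    def unclipped_position(pos, cigar, reverse):
        # Simplified reconstruction: forward reads key on pos minus leading
        # soft/hard clips; reverse reads key on the alignment end plus any
        # trailing clips.  Only the common CIGAR operators are handled.
        ops = re.findall(r"(\d+)([MIDSH])", cigar)
        if not reverse:
            lead = int(ops[0][0]) if ops[0][1] in "SH" else 0
            return pos - lead
        ref_len = sum(int(n) for n, op in ops if op in "MD")
        trail = int(ops[-1][0]) if ops[-1][1] in "SH" else 0
        return pos + ref_len - 1 + trail

    # Reproduces the docstring arithmetic: both reverse reads hit 88888474.
    assert unclipped_position(88888421, "22S54M", True) == 88888474
    assert unclipped_position(88888404, "5S71M", True) == 88888474
    assert unclipped_position(88888399, "76M", False) == 88888399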
Example #13
 def test_unmapped1(self):
     p = test_utils.pair1()
     p[0].set_mapped(False)
     p[1].set_mate_mapped(False)
     # Having an unmapped read before a mapped read is not allowed.  This should
     # raise an exception
     # The key is meaningless
     self.__ctx.add_value(test_utils.make_key(p[1]), proto.serialize_pair(p))
     self.assertRaises(ValueError, self.__reducer.reduce, self.__ctx)
Example #14
 def test_fragment_with_duplicate_in_pair_2(self):
     # Ensure the reducer catches a fragment duplicate of pair[1].
     p = list(test_utils.pair1())
     # Insert the pair into the context
     self.__ctx.add_value(test_utils.make_key(p[1]), PAIR_STRING)
     # Remove the first read from the pair, reorder so that the None is at
     # index 1, then serialize and insert into the context.
     test_utils.erase_read1(p)
     self.__ctx.add_value(test_utils.make_key(p[1]),
                          proto.serialize_pair((p[1], None)))
     self.__reducer.reduce(self.__ctx)
     # now ensure that nothing was emitted.  The pair isn't emitted because
     # the key refers to read2, and the fragment isn't emitted because it's a duplicate of
     # the one in the pair.
     self.assertEqual(0, len(self.__ctx.emitted.keys()))
     # check counter
     self.assertFalse(self.__ctx.counters.has_key(self.__pair_counter_name()))
     self.assertTrue(self.__ctx.counters.has_key(self.__frag_counter_name()))
     self.assertEqual(1, self.__ctx.counters[self.__frag_counter_name()])
Example #15
 def __pipe_pair_through(self, pair):
     message = io.serialize_pair(pair)
     return io.unserialize_pair(message)
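This helper states the round-trip property the serialization tests rely on: unserializing a serialized pair must reproduce the original mappings. A hedged sketch of an assertion built on the same idea, using the accessors (`get_name`, `tid`, `pos`) seen in the other examples here:

    def assert_pair_roundtrip(test, pair):
        # Illustrative check: serialize, unserialize, and compare key fields.
        # Assumes pair elements expose get_name()/tid/pos as elsewhere here.
        restored = io.unserialize_pair(io.serialize_pair(pair))
        for orig, back in zip(pair, restored):
            if orig is None:
                test.assertTrue(back is None)
            else:
                test.assertEqual(orig.get_name(), back.get_name())
                test.assertEqual(orig.tid, back.tid)
                test.assertEqual(orig.pos, back.pos)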
Example #16
 def test_empty_read1(self):
     # Ensure the reducer raises an exception if the pair[0] is None
     p = test_utils.erase_read1(list(test_utils.pair1()))
     self.__ctx.add_value(test_utils.make_key(p[1]),
                          proto.serialize_pair(p))
     self.assertRaises(ValueError, self.__reducer.reduce, self.__ctx)