Exemplo n.º 1
0
 def testMatchTwoAfter(self):
     """
     An inserted base that lies two sites after a non-inserted site must
     produce the expected offset.
     """
     # One matched site (reference offset 10) followed by two inserted bases;
     # the offset is requested for the second inserted base (index 2).
     pairs = ((0, 10), (1, None), (2, None))
     operations = (CMATCH, CINS, CINS)
     self.assertEqual((True, 11), insertionOffset(2, pairs, operations))
Exemplo n.º 2
0
 def testMatchTwoBefore(self):
     """
     An inserted base that lies two sites before a non-inserted site must
     produce the expected offset.
     """
     # Two inserted bases followed by one matched site (reference offset 10);
     # the offset is requested for the first inserted base (index 0).
     pairs = ((0, None), (1, None), (2, 10))
     operations = (CINS, CINS, CMATCH)
     self.assertEqual((False, 10), insertionOffset(0, pairs, operations))
Exemplo n.º 3
0
 def testMatchTwoAfterThenHardClips(self):
     """
     An inserted base that lies two sites after a non-inserted site must
     produce the expected offset even when hard clips follow the insertion.
     """
     # A matched site, two inserted bases, then two trailing hard clips; the
     # offset is requested for the second inserted base (index 2).
     pairs = ((0, 10), (1, None), (2, None), (3, None), (4, None))
     operations = (CMATCH, CINS, CINS, CHARD_CLIP, CHARD_CLIP)
     self.assertEqual((True, 11), insertionOffset(2, pairs, operations))
Exemplo n.º 4
0
 def testMatchTwoBeforeWithHardClips(self):
     """
     An inserted base that lies two sites before a non-inserted site must
     produce the expected offset even when hard clips precede the insertion.
     """
     # Two leading hard clips, two inserted bases, then a matched site at
     # reference offset 10; the offset is requested for the first inserted
     # base (index 2).
     pairs = ((0, None), (1, None), (2, None), (3, None), (4, 10))
     operations = (CHARD_CLIP, CHARD_CLIP, CINS, CINS, CMATCH)
     self.assertEqual((False, 10), insertionOffset(2, pairs, operations))
Exemplo n.º 5
0
def addPairsInfo(pairs, cigarOperations, query, qualities, referenceLength,
                 includeSoftClipped, correspondences, deletions, insertions):
    """
    Record, for one aligned query, which bases match, are inserted into, or
    are deleted from the reference.

    Nothing is returned. The results are accumulated in C{correspondences},
    C{deletions} and C{insertions}, all of which are modified in place.

    @param pairs: A sequence of 2-C{tuple}s of (query offset, reference
        offset). One member (but never both) of a tuple may be C{None} to
        indicate an indel.
    @param cigarOperations: A sequence of CIGAR operations, one for each
        element of C{pairs} (the two must have the same length).
    @param query: A C{str} query DNA sequence, indexed by query offset.
    @param qualities: A sequence of quality scores, indexed by query offset.
    @param referenceLength: Not used by this function.
    @param includeSoftClipped: If true, bases marked as soft-clipped are also
        added to C{correspondences} (at the offset computed by
        C{softClippedOffset}). Otherwise soft-clipped bases are ignored.
    @param correspondences: A mapping from reference offset to an object with
        an C{append(base, quality)} method. It is indexed without a prior
        membership check, so it presumably supplies a default value for
        unseen offsets (e.g., a C{defaultdict}).
    @param deletions: A mapping from C{int} reference offset to a count that
        is incremented each time the query deletes that offset. Unseen
        offsets are incremented directly, so the mapping presumably defaults
        missing keys (e.g., a C{Counter} or C{defaultdict(int)}).
    @param insertions: A mapping from insertion offset (as computed by
        C{insertionOffset}) to C{Insertion} instances. A new C{Insertion} is
        stored here the first time an offset is seen.
    """
    assert len(pairs) == len(cigarOperations)
    assert all(pair != (None, None) for pair in pairs)

    # True while we are part-way through a run of consecutive inserted bases,
    # so that each run is started exactly once.
    insertionOpen = False

    for index, (pair, operation) in enumerate(zip(pairs, cigarOperations)):
        queryOffset, referenceOffset = pair

        if queryOffset is None:
            # The reference has a base here that the query lacks: a deletion.
            assert operation == CDEL
            assert referenceOffset is not None
            deletions[referenceOffset] += 1
            insertionOpen = False
            continue

        if referenceOffset is not None:
            # Both offsets are present: the query base aligns to the
            # reference.
            assert operation in CONSUMES_REFERENCE
            insertionOpen = False
            nucleotide = query[queryOffset]
            score = qualities[queryOffset]
            correspondences[referenceOffset].append(nucleotide, score)
            continue

        # Only the query offset is present, so this base is either
        # soft-clipped or an insertion relative to the reference.
        nucleotide = query[queryOffset]
        score = qualities[queryOffset]

        if operation != CINS:
            assert operation == CSOFT_CLIP
            insertionOpen = False
            if includeSoftClipped:
                clippedOffset = softClippedOffset(index, pairs,
                                                  cigarOperations)
                correspondences[clippedOffset].append(nucleotide, score)
            continue

        # An inserted base. Find the offset the insertion is anchored at.
        lookedBack, anchor = insertionOffset(index, pairs, cigarOperations)

        if not insertionOpen:
            # First base of a new run of insertions: make sure there is an
            # Insertion for this offset and tell it a new run is starting.
            insertionOpen = True
            if anchor not in insertions:
                insertions[anchor] = Insertion(anchor)
            insertions[anchor].start(anchor if lookedBack else None)

        insertions[anchor].append(nucleotide, score)