def testNoOffsets(self):
    """
    Restricting the comparison to an empty set of offsets must give a
    result in which every count is zero and every offset list is empty.
    """
    first = Read('id1', 'ATT-T')
    second = Read('id2', 'A-GTC')
    expected = {
        'match': {
            'identicalMatchCount': 0,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareDNAReads(first, second, offsets=set()))
def testMatchWithAmbiguityAndNotStrict(self):
    """
    Two sequences that are identical apart from a single ambiguous code
    in the first one must be reported as expected by matchToString when
    matchAmbiguous=True is passed to both the comparison and the
    formatting call.
    """
    read1 = Read('id1', 'ACGTTS')
    read2 = Read('id2', 'ACGTTC')
    match = compareDNAReads(read1, read2, matchAmbiguous=True)
    # NOTE(review): the expected text below sits on a single line in this
    # view; the report presumably spans several lines - confirm the
    # literal's line breaks and indentation against matchToString's
    # actual output.
    self.assertEqual(
        '''\ Exact matches: 5/6 (83.33%) Ambiguous matches: 1/6 (16.67%) Exact or ambiguous matches: 6/6 (100.00%) Mismatches: 0 Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 0 Involving a gap in both sequences: 0 Id: id1 Length: 6 Gaps: 0 Ambiguous: 1/6 (16.67%) Id: id2 Length: 6 Gaps: 0 Ambiguous: 0''',
        matchToString(match, read1, read2, matchAmbiguous=True)
    )
def collectData(reads1, reads2, square, matchAmbiguous):
    """
    Compute pairwise match statistics between two collections of reads.

    @param reads1: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These form the rows of the table.
    @param reads2: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These form the columns of the table.
    @param square: If C{True}, the table compares a set of sequences with
        itself, so pairs with identical ids (the diagonal) are skipped.
    @param matchAmbiguous: If C{True}, ambiguous nucleotides that could be
        correct are counted as correct. Otherwise only unambiguous
        nucleotides may contribute to the match count.
    @return: A C{defaultdict} of C{dict}s. The 'match' section of each
        comparison is stored under both C{[id1][id2]} and C{[id2][id1]}.
    """
    table = defaultdict(dict)
    for rowId, rowRead in reads1.items():
        for columnId, columnRead in reads2.items():
            if square and rowId == columnId:
                # Diagonal cell of a square table: nothing to compare.
                continue
            comparison = compareDNAReads(
                rowRead, columnRead, matchAmbiguous=matchAmbiguous)
            stats = comparison['match']
            if not matchAmbiguous:
                # Strict matching must never report an ambiguous match.
                assert stats['ambiguousMatchCount'] == 0
            # Store the same statistics object in both directions.
            table[rowId][columnId] = stats
            table[columnId][rowId] = stats
    return table
def testGapAmbiguous(self):
    """
    Ambiguous characters that are aligned against gaps must still be
    listed in the ambiguousOffsets of the read that contains them.
    """
    first = Read('id1', 'AN--T')
    second = Read('id2', 'A--NT')
    expected = {
        'match': {
            'identicalMatchCount': 2,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [1],
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'ambiguousOffsets': [3],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testExtraAmbiguous(self):
    """
    When the first sequence is longer and its extra region contains an
    ambiguous base, the extra bases must be counted in extraCount and the
    ambiguous one must appear in ambiguousOffsets.
    """
    first = Read('id1', 'ACGTT-N')
    second = Read('id2', 'ACGTT')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [6],
            'extraCount': 2,
            'gapOffsets': [5],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testNonMatchingAmbiguityInFirst(self):
    """
    Two sequences that match exactly, apart from one (incompatible)
    ambiguity in the first sequence, must compare as expected.
    """
    first = Read('id1', 'ACGTTW')
    second = Read('id2', 'ACGTTC')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testMatchWithIdenticalAmbiguityButStrict(self):
    """
    With strict matching (matchAmbiguous=False), two sequences that carry
    the same ambiguous code at the same offset must have that position
    counted as a non-gap mismatch.
    """
    first = Read('id1', 'ACGTTN')
    second = Read('id2', 'ACGTTN')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(
        expected, compareDNAReads(first, second, matchAmbiguous=False))
def testExcludeGapLocations(self):
    """
    When matchToString is called with includeGapLocations=False, the
    resulting report must not contain a gap locations line.
    """
    read1 = Read('id1', 'TTTTTAAAAAAGCGCG')
    read2 = Read('id2', 'TTTTT------GCGCG')
    match = compareDNAReads(read1, read2)
    # Lift unittest's limit on diff length so a failure shows the whole
    # report.
    self.maxDiff = None
    # NOTE(review): the expected text below sits on a single line in this
    # view; the report presumably spans several lines - confirm the
    # literal's line breaks and indentation against matchToString's
    # actual output.
    self.assertEqual(
        '''\ Exact matches: 10/16 (62.50%) Ambiguous matches: 0 Mismatches: 6/16 (37.50%) Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 6/16 (37.50%) Involving a gap in both sequences: 0 Id: id1 Length: 16 Gaps: 0 Ambiguous: 0 Id: id2 Length: 16 Gaps: 6/16 (37.50%) Ambiguous: 0''',
        matchToString(match, read1, read2, includeGapLocations=False)
    )
def testNoOffsets(self):
    """
    Restricting the comparison to an empty set of offsets must give a
    result in which every count is zero and every offset list is empty.
    """
    first = Read('id1', 'ATT-T')
    second = Read('id2', 'A-GTC')
    expected = {
        'match': {
            'identicalMatchCount': 0,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareDNAReads(first, second, offsets=set()))
def testEmptySequences(self):
    """
    Comparing two empty sequences must give all-zero counts and empty
    offset lists.
    """
    first = Read('id1', '')
    second = Read('id2', '')
    expected = {
        'match': {
            'identicalMatchCount': 0,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testGapLocations(self):
    """
    The report produced by matchToString must list the (1-based) gap
    locations of a read that contains gaps.
    """
    read1 = Read('id1', 'TTTTTAAAAAAGCGCG')
    read2 = Read('id2', 'TTTTT------GCGCG')
    match = compareDNAReads(read1, read2)
    # Lift unittest's limit on diff length so a failure shows the whole
    # report.
    self.maxDiff = None
    # NOTE(review): the expected text below sits on a single line in this
    # view; the report presumably spans several lines - confirm the
    # literal's line breaks and indentation against matchToString's
    # actual output.
    self.assertEqual(
        '''\ Exact matches: 10/16 (62.50%) Ambiguous matches: 0 Mismatches: 6/16 (37.50%) Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 6/16 (37.50%) Involving a gap in both sequences: 0 Id: id1 Length: 16 Gaps: 0 Ambiguous: 0 Id: id2 Length: 16 Gaps: 6/16 (37.50%) Gap locations (1-based): 6, 7, 8, 9, 10, 11 Ambiguous: 0''',
        matchToString(match, read1, read2)
    )
def testGapGap(self):
    """
    Positions where both sequences have a gap must be counted separately
    (as gap-gap mismatches) from positions where only one has a gap.
    """
    first = Read('id1', 'AC--T')
    second = Read('id2', 'A--TT')
    expected = {
        'match': {
            'identicalMatchCount': 2,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testGapInSecond(self):
    """
    Gaps that occur only in the second sequence must be counted as gap
    mismatches and listed in the second read's gapOffsets.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'A--TT')
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testExactMatch(self):
    """
    Two identical sequences must be reported as matching at every
    position, with no mismatches, gaps, ambiguities or extra bases.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGTT')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testOffsets(self):
    """
    When a set of wanted offsets is given, only those offsets may
    contribute to the comparison result.
    """
    first = Read('id1', 'ATT-T')
    second = Read('id2', 'A-GTC')
    wanted = {0, 4}
    expected = {
        'match': {
            'identicalMatchCount': 1,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareDNAReads(first, second, offsets=wanted))
def testOffsets(self):
    """
    When a set of wanted offsets is given, only those offsets may
    contribute to the comparison result.
    """
    first = Read('id1', 'ATT-T')
    second = Read('id2', 'A-GTC')
    wanted = {0, 4}
    expected = {
        'match': {
            'identicalMatchCount': 1,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareDNAReads(first, second, offsets=wanted))
def testMatchWithAmbiguityButStrict(self):
    """
    With strict matching (matchAmbiguous=False), an ambiguous code in the
    first sequence that is compatible with the second sequence's base
    must nevertheless be counted as a non-gap mismatch.
    """
    first = Read('id1', 'ACGTTS')
    second = Read('id2', 'ACGTTC')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(
        expected, compareDNAReads(first, second, matchAmbiguous=False))
def testMatchWithIdenticalAmbiguityButStrict(self):
    """
    With strict matching (matchAmbiguous=False), two sequences that carry
    the same ambiguous code at the same offset must have that position
    counted as a non-gap mismatch.
    """
    first = Read('id1', 'ACGTTN')
    second = Read('id2', 'ACGTTN')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(
        expected, compareDNAReads(first, second, matchAmbiguous=False))
def testMatchWithIncompatibleAmbiguityInBoth(self):
    """
    When both sequences have an ambiguous code at the same offset and the
    two codes are incompatible, the position must be counted as a non-gap
    mismatch and listed as ambiguous in both reads.
    """
    first = Read('id1', 'ACGTTW')
    second = Read('id2', 'ACGTTS')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testGapAmbiguous(self):
    """
    Ambiguous characters that are aligned against gaps must still be
    listed in the ambiguousOffsets of the read that contains them.
    """
    first = Read('id1', 'AN--T')
    second = Read('id2', 'A--NT')
    expected = {
        'match': {
            'identicalMatchCount': 2,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [1],
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'ambiguousOffsets': [3],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testNonDefaultGapChars(self):
    """
    Every character passed in gapChars must be treated as a gap.
    """
    gapChars = '+$'
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [2],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [0],
        },
    }
    # The same result is expected whichever gap character is used.
    for gap in gapChars:
        first = Read('id1', 'AC%sTT' % gap)
        second = Read('id2', '%sCGTT' % gap)
        self.assertEqual(
            expected, compareDNAReads(first, second, gapChars='+$'))
def testExtraInSecond(self):
    """
    When the second sequence is longer than the first, the number of
    additional bases must be given in the second read's extraCount.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGTTCC')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 2,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testExactMatch(self):
    """
    Two identical sequences must be reported as matching at every
    position, with no mismatches, gaps, ambiguities or extra bases.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGTT')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testExtraAmbiguous(self):
    """
    When the first sequence is longer and its extra region contains an
    ambiguous base, the extra bases must be counted in extraCount and the
    ambiguous one must appear in ambiguousOffsets.
    """
    first = Read('id1', 'ACGTT-N')
    second = Read('id2', 'ACGTT')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [6],
            'extraCount': 2,
            'gapOffsets': [5],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testMatchWithAmbiguityInSecond(self):
    """
    When the sequences differ only by a compatible ambiguous code in the
    second one, a call to compareDNAReads with its default arguments
    must count that position as an ambiguous match.
    """
    first = Read('id1', 'ACGTTC')
    second = Read('id2', 'ACGTTS')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 1,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testGapInSecond(self):
    """
    Gaps that occur only in the second sequence must be counted as gap
    mismatches and listed in the second read's gapOffsets.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'A--TT')
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testMatchWithIncompatibleAmbiguityInBoth(self):
    """
    When both sequences have an ambiguous code at the same offset and the
    two codes are incompatible, the position must be counted as a non-gap
    mismatch and listed as ambiguous in both reads.
    """
    first = Read('id1', 'ACGTTW')
    second = Read('id2', 'ACGTTS')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testGapGap(self):
    """
    Positions where both sequences have a gap must be counted separately
    (as gap-gap mismatches) from positions where only one has a gap.
    """
    first = Read('id1', 'AC--T')
    second = Read('id2', 'A--TT')
    expected = {
        'match': {
            'identicalMatchCount': 2,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testNonDefaultGapChars(self):
    """
    Every character passed in gapChars must be treated as a gap.
    """
    gapChars = '+$'
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [2],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [0],
        },
    }
    # The same result is expected whichever gap character is used.
    for gap in gapChars:
        first = Read('id1', 'AC%sTT' % gap)
        second = Read('id2', '%sCGTT' % gap)
        self.assertEqual(
            expected, compareDNAReads(first, second, gapChars='+$'))
def testMismatch(self):
    """
    Positions at which the two sequences hold different unambiguous bases
    must be counted in nonGapMismatchCount.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGCC')
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 2,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testExtraInSecond(self):
    """
    When the second sequence is longer than the first, the number of
    additional bases must be given in the second read's extraCount.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGTTCC')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 2,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testEmptySequences(self):
    """
    Comparing two empty sequences must give all-zero counts and empty
    offset lists.
    """
    first = Read('id1', '')
    second = Read('id2', '')
    expected = {
        'match': {
            'identicalMatchCount': 0,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testMatchWithAmbiguityAndNotStrict(self):
    """
    Two sequences that are identical apart from a single ambiguous code
    in the first one must be reported as expected by matchToString when
    matchAmbiguous=True is passed to both the comparison and the
    formatting call.
    """
    read1 = Read('id1', 'ACGTTS')
    read2 = Read('id2', 'ACGTTC')
    match = compareDNAReads(read1, read2, matchAmbiguous=True)
    # NOTE(review): the expected text below sits on a single line in this
    # view; the report presumably spans several lines - confirm the
    # literal's line breaks and indentation against matchToString's
    # actual output.
    self.assertEqual(
        '''\ Exact matches: 5/6 (83.33%) Ambiguous matches: 1/6 (16.67%) Exact or ambiguous matches: 6/6 (100.00%) Mismatches: 0 Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 0 Involving a gap in both sequences: 0 Id: id1 Length: 6 Gaps: 0 Ambiguous: 1/6 (16.67%) Id: id2 Length: 6 Gaps: 0 Ambiguous: 0''',
        matchToString(match, read1, read2, matchAmbiguous=True))
def testMismatch(self):
    """
    Positions at which the two sequences hold different unambiguous bases
    must be counted in nonGapMismatchCount.
    """
    first = Read('id1', 'ACGTT')
    second = Read('id2', 'ACGCC')
    expected = {
        'match': {
            'identicalMatchCount': 3,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 2,
        },
        'read1': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareDNAReads(first, second))
def testMatchWithAmbiguityButStrict(self):
    """
    With strict matching (matchAmbiguous=False), an ambiguous code in the
    first sequence that is compatible with the second sequence's base
    must nevertheless be counted as a non-gap mismatch.
    """
    first = Read('id1', 'ACGTTS')
    second = Read('id2', 'ACGTTC')
    expected = {
        'match': {
            'identicalMatchCount': 5,
            'ambiguousMatchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'ambiguousOffsets': [5],
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'ambiguousOffsets': [],
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(
        expected, compareDNAReads(first, second, matchAmbiguous=False))
def collectData(reads1, reads2, square, matchAmbiguous):
    """
    Compute pairwise match statistics between two collections of reads.

    @param reads1: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These form the rows of the table.
    @param reads2: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These form the columns of the table.
    @param square: If C{True}, the table compares a set of sequences with
        itself, so pairs with identical ids (the diagonal) are skipped.
    @param matchAmbiguous: If C{True}, ambiguous nucleotides that could be
        correct are counted as correct. Otherwise only unambiguous
        nucleotides may contribute to the match count.
    @return: A C{defaultdict} of C{dict}s. The 'match' section of each
        comparison is stored under both C{[id1][id2]} and C{[id2][id1]}.
    """
    table = defaultdict(dict)
    for rowId, rowRead in reads1.items():
        for columnId, columnRead in reads2.items():
            if square and rowId == columnId:
                # Diagonal cell of a square table: nothing to compare.
                continue
            comparison = compareDNAReads(
                rowRead, columnRead, matchAmbiguous=matchAmbiguous)
            stats = comparison['match']
            if not matchAmbiguous:
                # Strict matching must never report an ambiguous match.
                assert stats['ambiguousMatchCount'] == 0
            # Store the same statistics object in both directions.
            table[rowId][columnId] = stats
            table[columnId][rowId] = stats
    return table
def processFeature(featureName, features, genome, fps, featureNumber, args):
    """
    Produce the requested outputs for one feature of a genome.

    @param featureName: A C{str} feature name.
    @param features: A C{Features} instance.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
    @param featureNumber: The C{int} 0-based count of the features
        requested (zero for the first feature, 1 for the second, etc.).
        A non-zero value causes a separating blank line to be printed
        before match output.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    def withoutGaps(read):
        # Copy of a read with all '-' characters removed from its sequence.
        return Read(read.id, read.sequence.replace('-', ''))

    result = genome.feature(featureName)
    feature = features.getFeature(featureName)
    referenceNt, genomeNt = result.ntSequences()
    referenceAa, genomeAa = result.aaSequences()

    # Set once nucleotide match output has been printed, so that the
    # amino acid match output is preceded by a blank line.
    ntMatchPrinted = False

    if args.printNtMatch:
        out = fps['nt-match']
        if featureNumber:
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        print(f' Reference nt location {feature["start"] + 1}, genome nt '
              f'location {result.genomeOffset + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        ntReport = dnaMatchToString(ntMatch, referenceNt, genomeNt,
                                    matchAmbiguous=False, indent=' ')
        print(ntReport, file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent=' ')
        ntMatchPrinted = True

    if args.printAaMatch:
        out = fps['aa-match']
        if ntMatchPrinted or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        aaReport = aaMatchToString(aaMatch, referenceAa, genomeAa,
                                   indent=' ')
        print(aaReport, file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent=' ')

    if args.printNtSequence:
        Reads([withoutGaps(genomeNt)]).save(fps['nt-sequence'])

    if args.printAaSequence:
        Reads([withoutGaps(genomeAa)]).save(fps['aa-sequence'])

    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])
def _writeOverallResultSummarySummary(self, results, outputDir):
    """
    Write a condensed summary of the overall results: for each alignment
    file, the fraction of each reference matched by its consensus, and
    finally the best-matching alignment/reference pair(s).

    @param results: A C{dict} of C{dicts}. Keyed by C{str} short alignment
        file name, then C{str} short reference name, and with values being
        C{dict}s with significant offsets and best consensus sequence for
        the corresponding reference in the alignment file.
    @param outputDir: A C{str} directory path. The summary is written to
        'result-summary-summary.txt' in this directory.
    """
    filename = join(outputDir, 'result-summary-summary.txt')
    self.report('Writing overall result summary summary to', filename)

    topFraction = 0.0
    topPairs = []

    with open(filename, 'w') as fp:
        for alignmentFilename in sorted(results):
            print(alignmentFilename, file=fp)

            entries = []
            for referenceId in sorted(results[alignmentFilename]):
                result = results[alignmentFilename][referenceId]
                referenceRead = self.referenceGenomes[referenceId]
                consensusRead = result['consensusRead']
                counts = compareDNAReads(
                    referenceRead, consensusRead)['match']
                matchCount = (counts['identicalMatchCount'] +
                              counts['ambiguousMatchCount'])
                referenceLength = len(referenceRead)
                fraction = matchCount / referenceLength

                # Track every pair that attains the highest fraction so
                # far (ties are all kept).
                pair = (alignmentFilename, referenceId)
                if fraction > topFraction:
                    topFraction = fraction
                    topPairs = [pair]
                elif fraction == topFraction:
                    topPairs.append(pair)

                line = ' %s: %d/%d (%.2f%%)' % (
                    referenceId, matchCount, referenceLength,
                    fraction * 100.0)
                entries.append((fraction, line))

            # Print in order of decreasing nucleotide identity fraction.
            for _, line in sorted(entries, reverse=True):
                print(line, file=fp)

            print(file=fp)

        plural = '' if len(topPairs) == 1 else 'es'
        print('Best match%s (%.2f%%):' % (plural, topFraction * 100.0),
              file=fp)
        for alignmentFilename, referenceId in topPairs:
            print(' %s: %s' % (alignmentFilename, referenceId), file=fp)
def _writeOverallResultSummary(self, results, outputDir):
    """
    Write a summary of the overall results. For every alignment file and
    reference, this reports the match of the reference with the consensus
    over all sites, over the significant sites only, and over the
    non-significant sites only.

    @param results: A C{dict} of C{dicts}. Keyed by C{str} short alignment
        file name, then C{str} short reference name, and with values being
        C{dict}s with significant offsets and best consensus sequence for
        the corresponding reference in the alignment file.
    @param outputDir: A C{str} directory path. The summary is written to
        'result-summary.txt' in this directory.
    """
    def writeMatch(fp, heading, reference, consensus, **restriction):
        # Compare the two reads (optionally restricted to some offsets)
        # and print the heading followed by the formatted match.
        match = compareDNAReads(reference, consensus, **restriction)
        print(heading, file=fp)
        print(matchToString(match, reference, consensus, indent=' ',
                            **restriction), file=fp)

    filename = join(outputDir, 'result-summary.txt')
    self.report('Writing overall result summary to', filename)

    with open(filename, 'w') as fp:
        for alignmentFilename in sorted(results):
            print('Alignment file', alignmentFilename, file=fp)

            for referenceId in sorted(results[alignmentFilename]):
                result = results[alignmentFilename][referenceId]
                referenceRead = self.referenceGenomes[referenceId]
                consensusRead = result['consensusRead']
                genomeLength = len(referenceRead)
                significantOffsets = result['significantOffsets']

                print('\n Reference %s (length %d)' %
                      (referenceId, genomeLength), file=fp)
                print(' %d significant offsets found.' %
                      len(significantOffsets), file=fp)

                # All sites.
                writeMatch(
                    fp, '\n Overall match of reference with consensus:',
                    referenceRead, consensusRead)

                # Significant sites only.
                writeMatch(
                    fp,
                    '\n Match of reference with consensus at '
                    '%d SIGNIFICANT sites:' % len(significantOffsets),
                    referenceRead, consensusRead,
                    offsets=significantOffsets)

                # Every other site in the genome.
                nonSignificantOffsets = (set(range(genomeLength)) -
                                         set(significantOffsets))
                writeMatch(
                    fp,
                    '\n Match of reference with consensus at '
                    '%d NON-SIGNIFICANT sites:' %
                    len(nonSignificantOffsets),
                    referenceRead, consensusRead,
                    offsets=nonSignificantOffsets)
def saveClosestReferenceConsensus(self, referenceId, components, baseCountAtOffset, genomeLength, alignedReads, referenceInsertions, outputDir): """ Calculate and save the best consensus to a reference genome. @param referenceId: The C{str} id of the reference sequence. @param components: A C{list} of C{ComponentByOffsets} instances. @param baseCountAtOffset: A C{list} of C{Counter} instances giving the count of each nucleotide at each genome offset. @param genomeLength: The C{int} length of the genome the reads were aligned to. @param alignedReads: A list of C{AlignedRead} instances. @param referenceInsertions: A C{dict} keyed by read id (the read that would cause a reference insertion). The values are lists of 2-tuples, with each 2-tuple containing an offset into the reference sequence and the C{str} of nucleotide that would be inserted starting at that offset. @param outputDir: A C{str} directory path. @return: A tuple of (consensus, unwantedReads, wantedCcReadCount, wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset). """ def ccMatchCount(cc, reference, drawFp, drawMessage): """ Count the matches between a consistent component and a reference genome. @param cc: A C{ConsistentComponent} instance. @param reference: A C{Read} instance. @param drawFp: A file pointer to write information about draws (if any) to. @param drawMessage: A C{str} message to write to C{drawFp}. If the string contains '%(baseCounts)s' that will be replaced by a string representation of the base counts (in C{counts}) obtained from C{baseCountsToStr}. If not, the base count info will be printed after the message. @return: The C{int} count of bases that match the reference for the offsets covered by the consistent component. 
""" referenceSequence = reference.sequence nucleotides = cc.nucleotides count = 0 for offset in nucleotides: message = (drawMessage + (' location %d: base counts' % (offset + 1)) + ' %(baseCounts)s.') referenceBase = referenceSequence[offset] componentBase = commonest(nucleotides[offset], referenceBase, drawFp=drawFp, drawMessage=message) count += int(componentBase == referenceBase) return count def sortedConsistentComponent(component, reference, fp): """ Sort the consistent components in the given C{ComponentByOffsets} instance according to how well they match the passed reference. The sort order is by increasing match score, so the best consistent component is last. @param component: A C{ComponentByOffsets} instance. @param reference: A C{Read} instance. @param fp: A file pointer to write information to. @return: The C{int} index of the best consistent component. """ result = [] for index, cc in enumerate(component.consistentComponents): matchCount = ccMatchCount( cc, reference, fp, ' Consistent component %d base draw' % (index + 1)) score = matchCount / len(cc.nucleotides) print(' Consistent component %d (%d reads) has %d exact ' 'matches with the reference, out of the %d offsets it ' 'covers (%.2f%%).' 
% (index + 1, len(cc.reads), matchCount, len( cc.nucleotides), score * 100.0), file=fp) result.append((score, len(cc.nucleotides), index, cc)) result.sort() return result reference = self.referenceGenomes[referenceId] fields = reference.id.split(maxsplit=1) if len(fields) == 1: referenceIdRest = '' else: referenceIdRest = ' ' + fields[1] infoFile = join(outputDir, 'reference-consensus.txt') self.report(' Saving closest consensus to reference info to', infoFile) with open(infoFile, 'w') as infoFp: print('Building consensus at significant offsets.', file=infoFp) consensus = [None] * genomeLength offsetsDone = set() wantedReads = set() unwantedReads = set() for count, component in enumerate(components, start=1): print('\nExamining component %d with %d locations: %s' % (count, len(component.offsets), commas(map(lambda offset: offset + 1, component.offsets))), file=infoFp) componentOffsets = set(component.offsets) sortedCcs = sortedConsistentComponent(component, reference, infoFp) while componentOffsets - offsetsDone: # The following pop call will raise an IndexError if # the sorted cc list is empty. But if it's empty we # shouldn't be here, because the set of included # offsets should at that point include everything in # this component. Having the naked pop here ensures we # get an exception if this assumption is incorrect. # It's like having an assert to test that we found all # the component's offsets following the loop. score, _, ccIndex, cc = sortedCcs.pop() print(' Incorporating nucleotides from consistent ' 'component %d (%d reads, score %.2f, covering %d ' 'locations (%d still undecided in consensus)) to ' 'consensus.' 
% (ccIndex + 1, len( cc.reads), score, len(cc.nucleotides), len(set(cc.nucleotides) - offsetsDone)), file=infoFp) wantedReads |= cc.reads for offset in sorted(cc.nucleotides): if offset in offsetsDone: continue nucleotides = cc.nucleotides[offset] referenceBase = reference.sequence[offset] base = commonest( nucleotides, referenceBase, drawFp=infoFp, drawMessage=(' WARNING: base count draw at ' 'location %d ' % (offset + 1)) + ' %(baseCounts)s.') assert consensus[offset] is None consensus[offset] = base offsetsDone.add(offset) # Do some reporting on the base just added. if base == referenceBase: mismatch = '' else: consensusBase = commonest( baseCountAtOffset[offset], referenceBase, drawFp=infoFp, drawMessage=( ' WARNING: consensus base count ' 'draw at location %d ' % (offset + 1)) + ' %(baseCounts)s.') mismatch = ( ' (mismatch: reference has %s, all-read ' 'consensus has %s)' % (referenceBase, consensusBase)) print(' Location %d: %s from nucleotides %s%s' % (offset + 1, base, nucleotides.baseCountsToStr(), mismatch), file=infoFp) # Print info about the cccs that were not needed to cover # all the offsets in this cc. Reverse the list so we print # them in decreasing match score order. for score, _, ccIndex, cc in reversed(sortedCcs): unwantedReads |= cc.reads print(' Will NOT incorporate nucleotides from consistent ' 'component %d (%d reads, score %.2f, covering %d ' 'locations) to consensus.' % (ccIndex + 1, len( cc.reads), score, len(cc.nucleotides)), file=infoFp) # Get the base counts at each offset, from the full set of # aligned reads minus the reads in cccs we're not using. (wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = gatherData(genomeLength, set(alignedReads) - unwantedReads) # Process the insignificant offsets, based on all reads EXCEPT # those not used in the connected components. 
offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone) print('\nAttempting to add bases from %d non-significant ' 'consensus locations, using all reads, EXCEPT those ' 'belonging to unused consistent components:' % len(offsetsToTry), file=infoFp) for offset in offsetsToTry: assert consensus[offset] is None baseCount = wantedReadsBaseCountAtOffset[offset] if baseCount: referenceBase = reference.sequence[offset] base = commonest( baseCount, referenceBase, drawFp=infoFp, drawMessage=( ' WARNING: consensus base count draw at ' 'location %d' % (offset + 1)) + ' %(baseCounts)s.') print(' Location %d: %s from nucleotides %s' % (offset + 1, base, baseCountsToStr(baseCount)), file=infoFp, end='') if base == referenceBase: print(file=infoFp) else: print(' (mismatch: reference has %s)' % referenceBase, file=infoFp) consensus[offset] = base offsetsDone.add(offset) # Process remaining insignificant offsets, using ALL reads # (i.e., including those in cccs that we wanted to avoid # using). At this point, this is the best we can do with these # final offsets (otherwise we will get gaps - which in some # cases may actually might be preferable because the reference # sequence may not be fully covered by the actual infection # sequence). offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone) print('\nAttempting to add bases from %d non-significant ' 'consensus locations, using all reads, INCLUDING those ' 'belonging to unused consistent components:' % len(offsetsToTry), file=infoFp) for offset in offsetsToTry: assert consensus[offset] is None referenceBase = reference.sequence[offset] baseCount = baseCountAtOffset[offset] if baseCount: base = commonest( baseCount, referenceBase, drawFp=infoFp, drawMessage=( ' WARNING: consensus base count draw at ' 'location %d' % (offset + 1)) + ' %(baseCounts)s.') print(' Location %d: %s from nucleotides %s' % (offset + 1, base, baseCountsToStr(baseCount)), file=infoFp, end='') else: # The reads did not cover this offset. 
base = '-' print(' Location %d: -' % (offset + 1), file=infoFp, end='') if base == referenceBase: print(file=infoFp) else: print(' (mismatch: reference has %s)' % referenceBase, file=infoFp) consensus[offset] = base offsetsDone.add(offset) # Sanity check: make sure we processed all offsets. assert offsetsDone == set(range(genomeLength)) consensusId = ( '%s-consensus%s' % (self.shortReferenceId[referenceId], referenceIdRest)) consensus = Read(consensusId, ''.join(consensus)) # Print details of the match of the consensus to the reference. match = compareDNAReads(reference, consensus) print('\nOVERALL match with reference:', file=infoFp) print(matchToString(match, reference, consensus, indent=' '), file=infoFp) # Print any insertions to the reference. wantedReadsWithInsertions = (set(referenceInsertions) & (set(alignedReads) - unwantedReads)) if wantedReadsWithInsertions: print('\nReference insertions present in %d read%s:' % (len(wantedReadsWithInsertions), s(len(wantedReadsWithInsertions))), file=infoFp) nucleotides = defaultdict(Counter) for readId in wantedReadsWithInsertions: for (offset, sequence) in referenceInsertions[readId]: for index, base in enumerate(sequence): nucleotides[offset + index][base] += 1 print(nucleotidesToStr(nucleotides, prefix=' '), file=infoFp) else: print('\nReference insertions: none.', file=infoFp) filename = join(outputDir, 'reference-consensus.fasta') self.report(' Saving consensus to', filename) Reads([consensus]).save(filename) wantedCcReadCount = 0 filename = join(outputDir, 'cc-wanted.fastq') with open(filename, 'w') as fp: for wantedCcRead in wantedReads: alignment = wantedCcRead.alignment if not (alignment.is_secondary or alignment.is_supplementary): wantedCcReadCount += 1 print(Read(alignment.query_name, alignment.query_sequence, alignmentQuality(alignment)).toString('fastq'), end='', file=fp) self.report( ' Saved %d read%s wanted in consistent connected components ' 'to %s' % (wantedCcReadCount, s(wantedCcReadCount), 
filename)) unwantedReads = set(alignedReads) - wantedReads return (consensus, unwantedReads, wantedCcReadCount, wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset)
def saveAlternateConsensus(self, referenceId, consensusRead,
                           baseCountAtOffset, readCountAtOffset,
                           genomeLength, outputDir):
    """
    Build an alternate consensus for a reference genome, write a report
    describing it, and save it as FASTA.

    For each genome offset the alternate base is chosen as follows:

      - If no read covers the offset, a gap ('-') is used.
      - If only one distinct nucleotide was seen, the base from
        C{consensusRead} is used.
      - Otherwise, the most common nucleotide that differs from the
        consensus base is used, provided it was seen more than once and
        its fraction of the reads at the offset exceeds
        C{self.alternateNucleotideMinFreq}. If not, the consensus base
        is used.

    @param referenceId: The C{str} id of the reference sequence.
    @param consensusRead: The C{dark.reads.Read} consensus sequence to
        calculate an alternative to.
    @param baseCountAtOffset: A C{list} of C{Counter} instances giving
        the count of each nucleotide at each genome offset.
    @param readCountAtOffset: A C{list} of C{int} counts of the total
        number of reads at each genome offset (i.e., just the sum of the
        values in C{baseCountAtOffset}).
    @param genomeLength: The C{int} length of the genome the reads were
        aligned to.
    @param outputDir: A C{str} directory path. The report
        ('reference-alternate-consensus.txt') and the FASTA file
        ('reference-alternate-consensus.fasta') are written there.
    @return: An alternate consensus C{dark.reads.Read} instance.
    """
    filename = join(outputDir, 'reference-alternate-consensus.txt')
    self.report(' Writing alternate consensus info to', filename)

    referenceRead = self.referenceGenomes[referenceId]

    # Whatever follows the first whitespace-separated field of the
    # reference id is appended to the id of the alternate consensus.
    idFields = referenceRead.id.split(maxsplit=1)
    referenceIdRest = '' if len(idFields) == 1 else ' ' + idFields[1]

    alternateBases = []

    with open(filename, 'w') as infoFp:
        print(
            'The three nucleotides shown on each line are for the '
            'reference, the consensus (made by clustering) and the '
            'alternate nucleotide.\nThese are followed by up to two '
            'asterisks: the first if the alternate nucleotide does not '
            'agree with the reference, the second if it does not agree '
            'with the consensus.', file=infoFp)

        for offset in range(genomeLength):
            referenceBase = referenceRead.sequence[offset]
            consensusBase = consensusRead.sequence[offset]
            baseCount = baseCountAtOffset[offset]

            if not baseCount:
                # The reads did not cover this offset.
                print(' Location %d: -' % (offset + 1), file=infoFp)
                alternateBases.append('-')
                continue

            if len(baseCount) == 1:
                # Only one nucleotide was found at this location. The
                # reference doesn't necessarily agree with the
                # consensus here, since the aligned reads may have had
                # a different base at this site.
                base = consensusBase
            else:
                # Take the highest-count nucleotide that is not the
                # consensus nucleotide. There must be one, seeing as at
                # least two distinct nucleotides were counted.
                alternateBase, alternateCount = next(
                    item for item in baseCount.most_common()
                    if item[0] != consensusBase)

                # Check that we found a base that's not the consensus
                # base.
                assert alternateBase != consensusBase

                # Only go with the alternate base if it is frequent
                # enough. Otherwise fall back to the original
                # consensus.
                alternateFraction = (alternateCount /
                                     readCountAtOffset[offset])
                frequentEnough = (
                    alternateCount > 1 and
                    alternateFraction > self.alternateNucleotideMinFreq)
                base = alternateBase if frequentEnough else consensusBase

            # An asterisk flags disagreement (with the reference first,
            # then with the consensus).
            referenceMarker = ' ' if referenceBase == base else '*'
            consensusMarker = ' ' if consensusBase == base else '*'

            print('Location %d: %s %s %s %s %s nucleotides %s' %
                  (offset + 1, referenceBase, consensusBase, base,
                   referenceMarker, consensusMarker,
                   baseCountsToStr(baseCount)), file=infoFp)

            alternateBases.append(base)

        alternateConsensusId = (
            '%s-alternate-consensus%s' %
            (self.shortReferenceId[referenceId], referenceIdRest))

        alternateConsensusRead = Read(alternateConsensusId,
                                      ''.join(alternateBases))

        # Report, in order, how the alternate consensus matches the
        # reference, how it matches the original consensus, and how the
        # original consensus matches the reference.
        comparisons = (
            ('\nAlternate consensus match with reference:',
             referenceRead, alternateConsensusRead),
            ('\nAlternate consensus match with original consensus:',
             consensusRead, alternateConsensusRead),
            ('\nOriginal consensus match with reference:',
             referenceRead, consensusRead),
        )

        for heading, first, second in comparisons:
            match = compareDNAReads(first, second)
            print(heading, file=infoFp)
            print(matchToString(match, first, second, indent=' '),
                  file=infoFp)

    filename = join(outputDir, 'reference-alternate-consensus.fasta')
    self.report(' Saving alternate consensus FASTA to', filename)
    Reads([alternateConsensusRead]).save(filename)

    return alternateConsensusRead
def processFeature(featureName, genome, fps, featureNumber, args):
    """
    Produce the requested output for one feature of a genome.

    Depending on the options in C{args}, this prints nucleotide and/or
    amino acid match summaries and differences, and saves the (gapless)
    sequences and/or the alignments for the feature.

    @param featureName: A C{str} feature name.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
        The keys used here are 'nt-match', 'aa-match', 'nt-sequence',
        'aa-sequence', 'nt-align', and 'aa-align'.
    @param featureNumber: The C{int} 0-based count of the features
        requested. This will be zero for the first feature, 1 for the
        second, etc. A non-zero value causes a blank line to be printed
        before a match section.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @raise TranslationError: If the feature cannot be translated and
        C{args.onError} is 'raise'.
    """
    referenceNt, genomeNt = genome.ntSequences(featureName)
    feature = genome.features[featureName]

    # The amino acid sequences are only needed (and only translated) if
    # some kind of amino acid output was asked for.
    if args.printAaMatch or args.printAaSequence or args.printAaAlignment:
        try:
            referenceAa, genomeAa = genome.aaSequences(featureName)
        except TranslationError as e:
            if args.onError == 'raise':
                raise
            if args.onError == 'print':
                print(
                    f'Could not translate feature {featureName} in genome '
                    f'{genome.genome.id}: {e}', file=sys.stderr)
            # With no translation, the amino acid output below is
            # skipped.
            referenceAa = genomeAa = None

    # Set once a nucleotide match has been printed, in which case a
    # separating blank line precedes the amino acid match.
    ntMatchPrinted = False

    if args.printNtMatch:
        out = fps['nt-match']
        if featureNumber:
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        print(f' Reference nt location {feature["start"] + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        print(dnaMatchToString(ntMatch, referenceNt, genomeNt,
                               matchAmbiguous=False, indent=' '),
              file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent=' ')
        ntMatchPrinted = True

    if args.printAaMatch and genomeAa:
        out = fps['aa-match']
        if ntMatchPrinted or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        print(aaMatchToString(aaMatch, referenceAa, genomeAa, indent=' '),
              file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent=' ')

    # Sequences are saved with all gap characters removed.
    if args.printNtSequence:
        ungappedNt = Read(genomeNt.id, genomeNt.sequence.replace('-', ''))
        Reads([ungappedNt]).save(fps['nt-sequence'])

    if args.printAaSequence and genomeAa:
        ungappedAa = Read(genomeAa.id, genomeAa.sequence.replace('-', ''))
        Reads([ungappedAa]).save(fps['aa-sequence'])

    # Alignments are saved with the genome first, then the reference.
    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment and genomeAa:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])
def plotConsistentComponents(referenceId, genomeLength, components,
                             significantOffsets, outfile, infoFile,
                             outputDir, title='xxx', show=False,
                             titleFontSize=12, axisFontSize=12):
    """
    Plot consistent connected components.

    For each component, the file 'component-N-consensuses.fasta' (N being
    the 1-based component number) is read from C{outputDir}. Its first
    sequence is used as the reference for the component and the following
    sequences as the consensuses of the component's consistent components,
    in order. Each consistent component is plotted at a height equal to
    the fraction of its non-gap sites that match the reference.

    @param referenceId: The id of the reference. Not used by this function.
    @param genomeLength: The C{int} genome length, used to set the range of
        the x axis.
    @param components: An iterable of components, each having C{offsets}
        and C{consistentComponents} attributes. Each consistent component
        must have C{reads} and C{nucleotides} attributes.
    @param significantOffsets: A collection of C{int} 0-based significant
        offsets.
    @param outfile: The C{str} name of the file to write the plot to.
    @param infoFile: The C{str} name of a file to write textual information
        about the components to.
    @param outputDir: The C{str} directory holding the per-component
        consensus FASTA files.
    @param title: The C{str} plot title.
    @param show: If C{True}, have plotly open the plot once written.
    @param titleFontSize: The C{int} font size for the plot title.
    @param axisFontSize: The C{int} font size for the axis titles.
    """
    def offsetsToLocationsStr(offsets):
        # Locations are 1-based, offsets are 0-based.
        return ', '.join(str(offset + 1) for offset in sorted(offsets))

    def boundaryLine(location):
        # A faint vertical line marking one edge of a component.
        return go.Scatter(
            x=(location, location),
            y=(-0.05, 1.05),
            mode='lines',
            hoverinfo='none',
            line={
                'color': '#eee',
            },
            showlegend=False,
        )

    data = []
    significantCount = len(significantOffsets)

    with open(infoFile, 'w') as fp:
        print('There are %d significant location%s: %s' %
              (significantCount, s(significantCount),
               offsetsToLocationsStr(significantOffsets)), file=fp)

        for count, component in enumerate(components, start=1):
            ccs = component.consistentComponents
            print('Processing component %d, with %d consistent component%s'
                  % (count, len(ccs), s(len(ccs))), file=fp)

            # Get the reference sequence for the component.
            consensusFile = join(outputDir,
                                 'component-%d-consensuses.fasta' % count)
            reads = list(FastaReads(consensusFile))
            reference = reads[0]
            length = len(reference)

            firstLocation = min(component.offsets) + 1
            lastLocation = max(component.offsets) + 1
            offsetCount = len(component.offsets)

            print(' Offset range: %d to %d' %
                  (firstLocation, lastLocation), file=fp)

            # Add a top line to represent the reference.
            data.append(
                go.Scatter(
                    x=(firstLocation, lastLocation),
                    y=(1.05, 1.05),
                    hoverinfo='text',
                    name=('Reference component %s' % count),
                    text=('Reference component %s, %d offsets' %
                          (count, offsetCount))))

            # Add vertical lines at the start and end of this component.
            data.append(boundaryLine(firstLocation))
            data.append(boundaryLine(lastLocation))

            # The position of an offset in this sorted list is its index
            # in the reference and consensus sequences.
            sortedOffsets = sorted(component.offsets)

            for ccCount, cc in enumerate(ccs, start=1):
                ccSummary = (
                    'Component read count %d, offsets covered %d/%d' %
                    (len(cc.reads), len(cc.nucleotides), offsetCount))

                # Get the consistent connected component consensus.
                consensus = reads[ccCount]
                assert ('consistent-component-%d' % ccCount) in consensus.id

                print(' Processing consistent component', ccCount, file=fp)
                print(' Component sequence:', consensus.sequence, file=fp)
                print(' %d offset%s: %s' %
                      (len(cc.nucleotides), s(len(cc.nucleotides)),
                       offsetsToLocationsStr(cc.nucleotides)), file=fp)

                match = compareDNAReads(reference, consensus)
                print(matchToString(match, reference, consensus,
                                    indent=' '), file=fp)

                matchCount = (match['match']['identicalMatchCount'] +
                              match['match']['ambiguousMatchCount'])

                # The match fraction will ignore gaps in the consensus
                # sequence as it is padded with '-' chars to align it to
                # the reference.
                fraction = matchCount / (
                    length - len(match['read2']['gapOffsets']))

                # The (sequence index, offset) pairs for the component
                # offsets that this consistent component covers.
                covered = [(index, offset)
                           for index, offset in enumerate(sortedOffsets)
                           if offset in cc.nucleotides]

                # x axis values are 1-based (locations, not offsets).
                x = [offset + 1 for _, offset in covered]
                y = [fraction] * len(cc.nucleotides)

                # Indices (into x) of the points where the consensus
                # agrees with the reference.
                identical = [
                    position
                    for position, (index, _) in enumerate(covered)
                    if consensus.sequence[index] == reference.sequence[index]
                ]

                text = [
                    'Location: %d, component: %s, reference: %s'
                    '<br>Component nucleotides: %s<br>%s' %
                    (offset + 1, consensus.sequence[index],
                     reference.sequence[index],
                     baseCountsToStr(cc.nucleotides[offset]), ccSummary)
                    for index, offset in covered
                ]

                # Points matching the reference are drawn in blue, all
                # others in red.
                data.append(
                    go.Scatter(
                        x=x,
                        y=y,
                        hoverinfo='text',
                        selectedpoints=identical,
                        showlegend=False,
                        text=text,
                        mode='markers',
                        selected={'marker': {
                            'color': 'blue',
                        }},
                        unselected={'marker': {
                            'color': 'red',
                        }}))

    # Add the significant offsets.
    data.append(
        go.Scatter(
            x=[offset + 1 for offset in significantOffsets],
            y=[-0.05] * significantCount,
            text=['Location %d' % (offset + 1)
                  for offset in significantOffsets],
            hoverinfo='text',
            mode='markers',
            name='Significant locations'))

    xaxis = {
        'range': (0, genomeLength + 1),
        'title': 'Genome location',
        'titlefont': {
            'size': axisFontSize,
        },
    }

    yaxis = {
        'range': (-0.1, 1.1),
        'title': 'Nucleotide identity with reference sequence',
        'titlefont': {
            'size': axisFontSize,
        },
    }

    layout = go.Layout(
        title=title,
        titlefont={
            'size': titleFontSize,
        },
        xaxis=xaxis,
        yaxis=yaxis,
        hovermode='closest',
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=outfile, auto_open=show,
                        show_link=False)
# Align. reads = needle(reads) if args.alignmentFile: assert reads.save(args.alignmentFile) == 2 read1, read2 = reads len1, len2 = map(len, reads) identicalLengths = len1 == len2 # Sanity check. if args.align: assert identicalLengths result = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict)) match = result['match'] identicalMatchCount = match['identicalMatchCount'] ambiguousMatchCount = match['ambiguousMatchCount'] gapMismatchCount = match['gapMismatchCount'] gapGapMismatchCount = match['gapGapMismatchCount'] nonGapMismatchCount = match['nonGapMismatchCount'] x = 'Post-alignment, sequence' if args.align else 'Sequence' if identicalLengths: print('%s lengths are identical: %s' % (x, len1)) else: print('%s lengths: %d, %d (difference %d)' % (x, len1, len2, abs(len1 - len2)))
if args.alignmentFile: assert reads.save(args.alignmentFile) == 2 offsets = (parseRangeString(args.sites, convertToZeroBased=True) if args.sites else None) read1, read2 = reads len1, len2 = map(len, reads) identicalLengths = len1 == len2 # Sanity check. if args.align: assert identicalLengths match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict), offsets=offsets) x = 'Post-alignment, sequence' if args.align else 'Sequence' if identicalLengths: print('%s lengths are identical: %s' % (x, len1)) else: print('%s lengths: %d, %d (difference %d)' % (x, len1, len2, abs(len1 - len2))) print(matchToString(match, read1, read2, matchAmbiguous=(not args.strict), offsets=offsets)) if args.showDiffs: # Print all sites where the sequences differ. width = int(log10(max(len1, len2))) + 1 headerPrinted = False
# Save the two reads if an output file was requested.
#
# The save is deliberately done in its own statement and not inside the
# assert: Python removes assert statements when run with -O, which would
# silently skip writing the file.
if args.alignmentFile:
    savedCount = reads.save(args.alignmentFile)
    assert savedCount == 2, (
        'Expected to save 2 reads to %r but saved %r.' %
        (args.alignmentFile, savedCount))

# The 0-based offsets to pass to the comparison, or None if no sites were
# given on the command line.
offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict),
                        offsets=offsets)

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' %
          (x, len1, len2, abs(len1 - len2)))

print(
    matchToString(match, read1, read2, matchAmbiguous=(not args.strict),
                  offsets=offsets))