def testGapAndMatch(self): """ Two reads that agree at every site except one, where the first read has a gap, must be summarized as expected by matchToString. """ read1 = AARead('id1', 'GALHN-') read2 = AARead('id2', 'GALHNA') match = compareAaReads(read1, read2) self.assertEqual( '''\ Matches: 5/6 (83.33%) Mismatches: 1/6 (16.67%) Not involving gaps (i.e., conflicts): 0 Involving a gap in one sequence: 1/6 (16.67%) Involving a gap in both sequences: 0 Id: id1 Length: 6 Gaps: 1/6 (16.67%) Gap locations (1-based): 6 Id: id2 Length: 6 Gaps: 0''', matchToString(match, read1, read2) )
def testOffsets(self):
    """
    Passing a set of wanted offsets must limit the comparison to the
    sites at those offsets.
    """
    first = AARead('id1', 'GAL-L')
    second = AARead('id2', 'G-LHN')
    # Only offsets 0 (G/G, a match) and 4 (L/N, a mismatch) are wanted, so
    # the gaps at offsets 1 and 3 must not appear in the result.
    expected = {
        'match': {
            'matchCount': 1,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 1,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareAaReads(first, second, offsets={0, 4}))
def testNonDefaultGapChars(self):
    """
    Characters given via gapChars must be treated as gaps.
    """
    # The expected result is the same whichever of the two gap characters
    # is placed in the reads.
    expected = {
        'match': {
            'matchCount': 3,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [2],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [0],
        },
    }
    for gapChar in '+$':
        first = AARead('id1', 'GA%sHN' % gapChar)
        second = AARead('id2', '%sALHN' % gapChar)
        self.assertEqual(
            expected, compareAaReads(first, second, gapChars='+$'))
def processFeature(featureName, features, genome, fps, featureNumber, args):
    """
    Produce the requested output for one feature of a genome.

    @param featureName: A C{str} feature name.
    @param features: A C{Features} instance.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
    @param featureNumber: The C{int} 0-based count of the features requested.
        This will be zero for the first feature, 1 for the second, etc.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    result = genome.feature(featureName)
    feature = features.getFeature(featureName)
    referenceNt, genomeNt = result.ntSequences()
    referenceAa, genomeAa = result.aaSequences()

    def saveUngapped(read, streamName):
        # Save a copy of the read with all '-' characters removed.
        ungapped = Read(read.id, read.sequence.replace('-', ''))
        Reads([ungapped]).save(fps[streamName])

    # Becomes True once the nucleotide match has been printed, in which
    # case a blank line must precede any amino acid match output.
    separatorNeeded = False

    if args.printNtMatch:
        out = fps['nt-match']
        if featureNumber:
            # Separate this feature's output from the preceding one.
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        # Locations are printed 1-based.
        print(f' Reference nt location {feature["start"] + 1}, genome nt '
              f'location {result.genomeOffset + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        summary = dnaMatchToString(ntMatch, referenceNt, genomeNt,
                                   matchAmbiguous=False, indent=' ')
        print(summary, file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent=' ')
        separatorNeeded = True

    if args.printAaMatch:
        out = fps['aa-match']
        if separatorNeeded or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        summary = aaMatchToString(aaMatch, referenceAa, genomeAa,
                                  indent=' ')
        print(summary, file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent=' ')

    if args.printNtSequence:
        saveUngapped(genomeNt, 'nt-sequence')

    if args.printAaSequence:
        saveUngapped(genomeAa, 'aa-sequence')

    # In the alignment output the genome read is saved before the
    # reference read.
    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])
def testGapInSecond(self):
    """
    Gaps that occur only in the second sequence must be counted as gap
    mismatches and reported in the second read's gap offsets.
    """
    first = AARead('id1', 'GALHN')
    second = AARead('id2', 'G--HN')
    expected = {
        'match': {
            'matchCount': 3,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testGapInFirst(self):
    """
    A gap that occurs only in the first sequence must be counted as a gap
    mismatch and reported in the first read's gap offsets.
    """
    first = AARead('id1', 'GAL-N')
    second = AARead('id2', 'GALHN')
    expected = {
        'match': {
            'matchCount': 4,
            'gapMismatchCount': 1,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [3],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testExactMatch(self):
    """
    Identical sequences must produce a match at every site and no
    mismatches, gaps, or extra residues.
    """
    first = AARead('id1', 'GALHN')
    second = AARead('id2', 'GALHN')
    expected = {
        'match': {
            'matchCount': 5,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testEmptySequences(self):
    """
    Comparing two empty sequences must give all-zero counts and no gap
    offsets.
    """
    first = AARead('id1', '')
    second = AARead('id2', '')
    expected = {
        'match': {
            'matchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testGapGap(self):
    """
    A site where both sequences have a gap must be counted separately
    from sites where only one sequence has a gap.
    """
    first = AARead('id1', 'GA--N')
    second = AARead('id2', 'G--HN')
    # Offset 2 is a gap in both reads; offsets 1 and 3 each have a gap in
    # just one of the reads.
    expected = {
        'match': {
            'matchCount': 2,
            'gapMismatchCount': 2,
            'gapGapMismatchCount': 1,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [2, 3],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [1, 2],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testMismatch(self):
    """
    Sites where the sequences differ, with no gap involved, must be
    counted in nonGapMismatchCount.
    """
    first = AARead('id1', 'GALYY')
    second = AARead('id2', 'GALHN')
    expected = {
        'match': {
            'matchCount': 3,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 2,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
def testNoOffsets(self):
    """
    Passing an empty set of wanted offsets must give a result in which
    nothing is counted.
    """
    first = AARead('id1', 'GAL-N')
    second = AARead('id2', 'G-LHN')
    expected = {
        'match': {
            'matchCount': 0,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 0,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected,
                     compareAaReads(first, second, offsets=set()))
def testExtraInSecond(self):
    """
    When the second sequence is longer than the first, the number of
    additional residues must be given in its extraCount.
    """
    first = AARead('id1', 'GALHN')
    second = AARead('id2', 'GALHNHN')
    expected = {
        'match': {
            'matchCount': 5,
            'gapMismatchCount': 0,
            'gapGapMismatchCount': 0,
            'nonGapMismatchCount': 0,
        },
        'read1': {
            'extraCount': 0,
            'gapOffsets': [],
        },
        'read2': {
            'extraCount': 2,
            'gapOffsets': [],
        },
    }
    self.assertEqual(expected, compareAaReads(first, second))
if args.alignmentFile: assert reads.save(args.alignmentFile) == 2 offsets = (parseRangeString(args.sites, convertToZeroBased=True) if args.sites else None) read1, read2 = reads len1, len2 = map(len, reads) identicalLengths = len1 == len2 # Sanity check. if args.align: assert identicalLengths match = compareAaReads(read1, read2, offsets=offsets) x = 'Post-alignment, sequence' if args.align else 'Sequence' if identicalLengths: print('%s lengths are identical: %s' % (x, len1)) else: print('%s lengths: %d, %d (difference %d)' % (x, len1, len2, abs(len1 - len2))) print(matchToString(match, read1, read2, offsets=offsets)) if args.showDiffs: # Print all sites where the sequences differ. width = int(log10(max(len1, len2))) + 1 headerPrinted = False for site, (a, b) in enumerate(zip(read1.sequence, read2.sequence),
def processFeature(featureName, genome, fps, featureNumber, args):
    """
    Produce the requested output for one feature of a genome.

    The amino acid sequences are only computed when some amino acid output
    was requested. If that translation fails with a C{TranslationError},
    C{args.onError} decides what happens: 'raise' re-raises the error,
    'print' writes a message to standard error, and any other value is
    silent. In the last two cases all amino acid output for the feature is
    skipped.

    @param featureName: A C{str} feature name.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
    @param featureNumber: The C{int} 0-based count of the features requested.
        This will be zero for the first feature, 1 for the second, etc.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    referenceNt, genomeNt = genome.ntSequences(featureName)
    feature = genome.features[featureName]

    # These stay None unless amino acid output is wanted and the
    # translation succeeds.
    referenceAa = genomeAa = None
    aaWanted = (args.printAaMatch or args.printAaSequence or
                args.printAaAlignment)
    if aaWanted:
        try:
            referenceAa, genomeAa = genome.aaSequences(featureName)
        except TranslationError as e:
            if args.onError == 'raise':
                raise
            if args.onError == 'print':
                print(
                    f'Could not translate feature {featureName} in genome '
                    f'{genome.genome.id}: {e}', file=sys.stderr)

    def saveUngapped(read, streamName):
        # Save a copy of the read with all '-' characters removed.
        ungapped = Read(read.id, read.sequence.replace('-', ''))
        Reads([ungapped]).save(fps[streamName])

    # Becomes True once the nucleotide match has been printed, in which
    # case a blank line must precede any amino acid match output.
    separatorNeeded = False

    if args.printNtMatch:
        out = fps['nt-match']
        if featureNumber:
            # Separate this feature's output from the preceding one.
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        # The location is printed 1-based.
        print(f' Reference nt location {feature["start"] + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        summary = dnaMatchToString(ntMatch, referenceNt, genomeNt,
                                   matchAmbiguous=False, indent=' ')
        print(summary, file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent=' ')
        separatorNeeded = True

    if args.printAaMatch and genomeAa:
        out = fps['aa-match']
        if separatorNeeded or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        summary = aaMatchToString(aaMatch, referenceAa, genomeAa,
                                  indent=' ')
        print(summary, file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent=' ')

    if args.printNtSequence:
        saveUngapped(genomeNt, 'nt-sequence')

    if args.printAaSequence and genomeAa:
        saveUngapped(genomeAa, 'aa-sequence')

    # In the alignment output the genome read is saved before the
    # reference read.
    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment and genomeAa:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])