Пример #1
0
    def testGapAndMatch(self):
        """
        Two sequences containing matches and gaps must compare as
        expected.
        """
        read1 = AARead('id1', 'GALHN-')
        read2 = AARead('id2', 'GALHNA')
        match = compareAaReads(read1, read2)

        self.assertEqual(
            '''\
Matches: 5/6 (83.33%)
Mismatches: 1/6 (16.67%)
  Not involving gaps (i.e., conflicts): 0
  Involving a gap in one sequence: 1/6 (16.67%)
  Involving a gap in both sequences: 0
  Id: id1
    Length: 6
    Gaps: 1/6 (16.67%)
    Gap locations (1-based): 6
  Id: id2
    Length: 6
    Gaps: 0''',
            matchToString(match, read1, read2)
        )
Пример #2
0
 def testOffsets(self):
     """
     When a set of wanted offsets is given, only those offsets must be
     taken into account in the result.
     """
     wanted = {0, 4}
     first = AARead('id1', 'GAL-L')
     second = AARead('id2', 'G-LHN')
     expected = {
         'match': {
             'matchCount': 1,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected,
                      compareAaReads(first, second, offsets=wanted))
Пример #3
0
 def testNonDefaultGapChars(self):
     """
     It must be possible to specify which characters count as gaps.
     """
     gapChars = '+$'
     # The same result is expected whichever of the gap characters is
     # placed in the sequences.
     expected = {
         'match': {
             'matchCount': 3,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': [2]},
         'read2': {'extraCount': 0, 'gapOffsets': [0]},
     }
     for gap in gapChars:
         first = AARead('id1', 'GA%sHN' % gap)
         second = AARead('id2', '%sALHN' % gap)
         self.assertEqual(
             expected, compareAaReads(first, second, gapChars=gapChars))
Пример #4
0
def processFeature(featureName, features, genome, fps, featureNumber, args):
    """
    Process a feature from a genome, writing the requested outputs.

    @param featureName: A C{str} feature name.
    @param features: A C{Features} instance.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
        The keys read here are 'nt-match', 'aa-match', 'nt-sequence',
        'aa-sequence', 'nt-align', and 'aa-align' (each only when the
        corresponding option is set in C{args}).
    @param featureNumber: The C{int} 0-based count of the features requested.
        This will be zero for the first feature, 1 for the second, etc.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    result = genome.feature(featureName)
    feature = features.getFeature(featureName)
    referenceNt, genomeNt = result.ntSequences()

    # Only translate when some amino acid output was actually requested, so
    # that a failure (or the cost) of translation cannot affect a run that
    # only wants nucleotide output.
    if args.printAaMatch or args.printAaSequence or args.printAaAlignment:
        referenceAa, genomeAa = result.aaSequences()

    # Set once nucleotide match output has been written, so that a blank
    # line is printed before any amino acid match output that follows.
    newlineNeeded = False

    if args.printNtMatch:
        fp = fps['nt-match']
        if featureNumber:
            # Separate this feature's output from the previous feature's.
            print(file=fp)
        print(f'Feature: {featureName} nucleotide match', file=fp)
        # Locations are printed 1-based.
        print(f'  Reference nt location {feature["start"] + 1}, genome nt '
              f'location {result.genomeOffset + 1}', file=fp)
        match = compareDNAReads(referenceNt, genomeNt)
        print(dnaMatchToString(match, referenceNt, genomeNt,
                               matchAmbiguous=False, indent='  '), file=fp)
        printDiffs(referenceNt, genomeNt, True, feature['start'], fp,
                   indent='    ')
        newlineNeeded = True

    if args.printAaMatch:
        fp = fps['aa-match']
        if newlineNeeded or featureNumber:
            print(file=fp)
        print(f'Feature: {featureName} amino acid match', file=fp)
        match = compareAaReads(referenceAa, genomeAa)
        print(aaMatchToString(match, referenceAa, genomeAa, indent='  '),
              file=fp)
        printDiffs(referenceAa, genomeAa, False, feature['start'], fp,
                   indent='    ')

    if args.printNtSequence:
        # Save the genome sequence with its '-' gap characters removed.
        noGaps = Read(genomeNt.id, genomeNt.sequence.replace('-', ''))
        Reads([noGaps]).save(fps['nt-sequence'])

    if args.printAaSequence:
        noGaps = Read(genomeAa.id, genomeAa.sequence.replace('-', ''))
        Reads([noGaps]).save(fps['aa-sequence'])

    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])
Пример #5
0
 def testGapInSecond(self):
     """
     Gaps that occur in the second sequence must be counted as gap
     mismatches and their offsets reported for that sequence.
     """
     first = AARead('id1', 'GALHN')
     second = AARead('id2', 'G--HN')
     expected = {
         'match': {
             'matchCount': 3,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': [1, 2]},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #6
0
 def testGapInFirst(self):
     """
     A gap that occurs in the first sequence must be counted as a gap
     mismatch and its offset reported for that sequence.
     """
     first = AARead('id1', 'GAL-N')
     second = AARead('id2', 'GALHN')
     expected = {
         'match': {
             'matchCount': 4,
             'gapMismatchCount': 1,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': [3]},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #7
0
 def testExactMatch(self):
     """
     Identical sequences must compare as matching at every site.
     """
     first = AARead('id1', 'GALHN')
     second = AARead('id2', 'GALHN')
     expected = {
         'match': {
             'matchCount': 5,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #8
0
 def testEmptySequences(self):
     """
     Comparing two empty sequences must give all-zero counts and no gap
     offsets.
     """
     first = AARead('id1', '')
     second = AARead('id2', '')
     expected = {
         'match': {
             'matchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #9
0
 def testGapGap(self):
     """
     A site where both sequences have a gap must be counted separately
     from sites where only one sequence has a gap.
     """
     first = AARead('id1', 'GA--N')
     second = AARead('id2', 'G--HN')
     expected = {
         'match': {
             'matchCount': 2,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': [2, 3]},
         'read2': {'extraCount': 0, 'gapOffsets': [1, 2]},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #10
0
 def testMismatch(self):
     """
     Sites at which the sequences differ, with no gap involved, must be
     counted in the nonGapMismatchCount.
     """
     first = AARead('id1', 'GALYY')
     second = AARead('id2', 'GALHN')
     expected = {
         'match': {
             'matchCount': 3,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 2,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #11
0
 def testNoOffsets(self):
     """
     Passing an empty set of wanted offsets must give a result in which
     nothing has been counted.
     """
     first = AARead('id1', 'GAL-N')
     second = AARead('id2', 'G-LHN')
     expected = {
         'match': {
             'matchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 0, 'gapOffsets': []},
     }
     self.assertEqual(expected,
                      compareAaReads(first, second, offsets=set()))
Пример #12
0
 def testExtraInSecond(self):
     """
     When the second sequence is longer than the first, the number of
     additional bases must be given in its extraCount.
     """
     first = AARead('id1', 'GALHN')
     second = AARead('id2', 'GALHNHN')
     expected = {
         'match': {
             'matchCount': 5,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {'extraCount': 0, 'gapOffsets': []},
         'read2': {'extraCount': 2, 'gapOffsets': []},
     }
     self.assertEqual(expected, compareAaReads(first, second))
Пример #13
0
    if args.alignmentFile:
        assert reads.save(args.alignmentFile) == 2

offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareAaReads(read1, read2, offsets=offsets)

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' % (x, len1, len2,
                                                  abs(len1 - len2)))

print(matchToString(match, read1, read2, offsets=offsets))

if args.showDiffs:
    # Print all sites where the sequences differ.
    width = int(log10(max(len1, len2))) + 1
    headerPrinted = False
    for site, (a, b) in enumerate(zip(read1.sequence, read2.sequence),
Пример #14
0
    if args.alignmentFile:
        assert reads.save(args.alignmentFile) == 2

offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareAaReads(read1, read2, offsets=offsets)

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' %
          (x, len1, len2, abs(len1 - len2)))

print(matchToString(match, read1, read2, offsets=offsets))

if args.showDiffs:
    # Print all sites where the sequences differ.
    width = int(log10(max(len1, len2))) + 1
    headerPrinted = False
    for site, (a, b) in enumerate(zip(read1.sequence, read2.sequence),
Пример #15
0
def processFeature(featureName, genome, fps, featureNumber, args):
    """
    Write the requested outputs for one feature of a genome.

    @param featureName: A C{str} feature name.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
    @param featureNumber: The C{int} 0-based count of the features requested
        (zero for the first feature, 1 for the second, etc.). A non-zero
        value causes a blank line to be printed before match output.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @raise TranslationError: If translating the feature fails and
        C{args.onError} is 'raise'.
    """
    referenceNt, genomeNt = genome.ntSequences(featureName)
    feature = genome.features[featureName]

    # The amino acid reads remain None unless a translation is both wanted
    # and successful. The amino acid outputs below are skipped in that case.
    referenceAa = genomeAa = None
    wantAa = (args.printAaMatch or args.printAaSequence or
              args.printAaAlignment)
    if wantAa:
        try:
            referenceAa, genomeAa = genome.aaSequences(featureName)
        except TranslationError as e:
            if args.onError == 'raise':
                raise
            if args.onError == 'print':
                print(f'Could not translate feature {featureName} in genome '
                      f'{genome.genome.id}: {e}', file=sys.stderr)

    # True when a blank separator line must precede the next match output:
    # either this is not the first feature or nucleotide match output has
    # already been written for this feature.
    separate = bool(featureNumber)

    if args.printNtMatch:
        out = fps['nt-match']
        if separate:
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        print(f'  Reference nt location {feature["start"] + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        summary = dnaMatchToString(ntMatch, referenceNt, genomeNt,
                                   matchAmbiguous=False, indent='  ')
        print(summary, file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent='    ')
        separate = True

    if args.printAaMatch and genomeAa:
        out = fps['aa-match']
        if separate:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        summary = aaMatchToString(aaMatch, referenceAa, genomeAa,
                                  indent='  ')
        print(summary, file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent='    ')

    if args.printNtSequence:
        # Save the genome sequence with its '-' gap characters removed.
        ungapped = genomeNt.sequence.replace('-', '')
        Reads([Read(genomeNt.id, ungapped)]).save(fps['nt-sequence'])

    if args.printAaSequence and genomeAa:
        ungapped = genomeAa.sequence.replace('-', '')
        Reads([Read(genomeAa.id, ungapped)]).save(fps['aa-sequence'])

    if args.printNtAlignment:
        alignment = Reads([genomeNt, referenceNt])
        alignment.save(fps['nt-align'])

    if args.printAaAlignment and genomeAa:
        alignment = Reads([genomeAa, referenceAa])
        alignment.save(fps['aa-align'])