Exemplo n.º 1
    def testAaSequencesChangesTranslationErrorIgnore(self):
        """
        Check that no error is printed when checking AA sequences and
        onError='ignore' and that the expected result is returned.
        """
        features = Features(
            {
                'orf1ab': {
                    'name': 'ORF1ab polyprotein',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Anything written to errFp would show up here.
        err = StringIO()

        testCount, errorCount, result = genome.checkFeature(
            'orf1ab', 'A100000A', nt=False, onError='ignore', errFp=err)
        self.assertEqual('', err.getvalue())

        # The check is still counted as a (failed) test.
        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 2
    def testNtSequencesChangesIndexErrorIgnore(self):
        """
        If we check on nucleotide sequences with an out-of-range
        check, no error should be printed if we pass onError='ignore'
        and the expected error result must be returned.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Anything written to errFp would show up here.
        err = StringIO()
        testCount, errorCount, result = genome.checkFeature(
            'spike', 'A100000A', nt=True, onError='ignore', errFp=err)
        self.assertEqual('', err.getvalue())

        # The check is still counted as a (failed) test.
        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 3
    def testNtSequencesChangesString(self):
        """
        It must be possible to retrieve aligned nucleotide sequences
        and check on changes using a string specification.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: 1-based locations.
        testCount, errorCount, result = genome.checkFeature(
            'spike', 'A1A T2A A3T T4T', True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)
        self.assertEqual((True, 'A', True, 'A'), result['A1A'])
        self.assertEqual((True, 'T', False, 'T'), result['T2A'])
        self.assertEqual((False, 'T', True, 'T'), result['A3T'])
        self.assertEqual((False, 'C', False, 'C'), result['T4T'])
Exemplo n.º 4
 def testPassingRefence(self):
     """
     It must be possible to pass a reference.
     """
     reference = DNARead('refId', 'ATTC')
     features = Features({}, reference)
     # The very same object must be stored, not a copy.
     self.assertIs(reference, features.reference)
Exemplo n.º 5
    def testNtSequencesChangesTuple(self):
        """
        It must be possible to retrieve aligned nucleotide sequences
        and check on changes using a tuple specification.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: 0-based offsets.
        # NOTE(review): the feature-name argument was missing from the
        # source; 'spike' is restored to match the string-specification
        # test, which passes the feature name first - confirm.
        testCount, errorCount, result = genome.checkFeature(
            'spike',
            (('A', 0, 'A'), ('T', 1, 'A'), ('A', 2, 'T'), ('T', 3, 'T')), True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)
        self.assertEqual((True, 'A', True, 'A'), result[('A', 0, 'A')])
        self.assertEqual((True, 'T', False, 'T'), result[('T', 1, 'A')])
        self.assertEqual((False, 'T', True, 'T'), result[('A', 2, 'T')])
        self.assertEqual((False, 'C', False, 'C'), result[('T', 3, 'T')])
Exemplo n.º 6
 def testKnownFeatures(self):
     """
     Only known feature names are allowed.
     """
     features = Features()
     # Every feature named in any variant's changes must be something
     # the default Features instance can look up.
     for variant in VARIANTS:
         for featureName in VARIANTS[variant]['changes']:
             self.assertIsInstance(features[featureName], dict)
Exemplo n.º 7
def main(args):
    """
    Describe SARS-CoV-2 annotations.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    features = Features(args.gbFile)

    print(f'Features for {features.reference.id}:')

    # NOTE(review): featureName is never printed in the visible source; a
    # per-feature heading line may have been lost in extraction - confirm.
    for featureName, feature in sorted(features.items()):
        print('  start:', feature['start'])
        print('  stop:', feature['stop'])
        print('  length:', feature['stop'] - feature['start'])

        # 'product' and 'function' are optional keys: print them only when
        # the feature has them.
        try:
            print('  product:', feature['product'])
        except KeyError:
            pass

        try:
            print('  function:', feature['function'])
        except KeyError:
            pass

        # Long sequences are truncated to args.maxLen characters and marked
        # with a trailing ellipsis.
        sequence = feature['sequence']
        print(f'  sequence    (len {len(sequence):5d} nt):',
              (sequence[:args.maxLen] +
               '...') if len(sequence) > args.maxLen else sequence)

        try:
            translation = feature['translation']
        except KeyError:
            # Some features (e.g., UTR, stem loops) do not have a translation.
            pass
        else:
            print(f'  translation (len {len(translation):5d} aa):',
                  (translation[:args.maxLen] +
                   '...') if len(translation) > args.maxLen else translation)
Exemplo n.º 8
    def testPassingDict(self):
        """
        It must be possible to initialize a Features instance via a dict.
        """
        value = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': value})

        self.assertIn('spike', features)
        self.assertEqual(value, features['spike'])
Exemplo n.º 9
    def testNtSequencesGenomeGap(self):
        """
        The genome must be able to have a gap relative to the reference.
        """
        referenceSequence = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # The genome has one T fewer (19 instead of 20) than the reference.
        genomeFeature = 'TGGA' + ('T' * 19) + 'CAAATCGG'
        genomeSequence = 'CCCGGTGGCG' + genomeFeature + 'CCCCCCC'

        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': referenceSequence,
                    'start': 5,
                    'stop': len(referenceSequence),
                },
            },
            DNARead('refId', referenceSequence))

        genome = SARS2Genome(DNARead('genId', genomeSequence), features)

        # The genome offset is initialized to None and isn't set until
        # after ntSequences is called.
        # self.assertEqual(None, alignment.genomeOffset)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # self.assertEqual(5, alignment.genomeOffset)

        self.assertEqual(referenceSequence[5:], referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        # The missing T appears as a gap character in the genome sequence.
        expected = 'TGGA-' + ('T' * 19) + 'CAAATCGG'
        self.assertEqual(expected, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        testCount, errorCount, result = genome.checkFeature(
            'spike', 'T5-', True)

        self.assertEqual(1, testCount)
        self.assertEqual(0, errorCount)
        self.assertEqual((True, 'T', True, '-'), result['T5-'])
Exemplo n.º 10
    def testAaSequencesTranslationNoSlipperySequenceRaise(self):
        """
        The aaSequences function must raise if it can't translate an
        'ORF1ab polyprotein' sequence due to a missing slippery sequence.
        """
        features = Features(
            {
                'ORF1ab polyprotein': {
                    'name': 'ORF1ab polyprotein',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        error = r'^No slippery sequence found\.$'
        self.assertRaisesRegex(NoSlipperySequenceError, error,
                               genome.aaSequences, 'ORF1ab polyprotein')
Exemplo n.º 11
    def testNtSequencesChangesIndexErrorPrint(self):
        """
        If we check on nucleotide sequences with an out-of-range
        check, an error must be printed if we pass onError='print'
        and the expected error result must be returned.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        err = StringIO()

        # Two lines of error output are printed.
        # NOTE(review): the newline separators and the closing parenthesis
        # were missing from the source; they are restored here on the basis
        # of the "two lines" comment above - confirm against the output of
        # checkFeature.
        error = (
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'refId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'genId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
        )
        testCount, errorCount, result = genome.checkFeature(
            'spike', 'A100000A', nt=True, onError='print', errFp=err)
        self.assertEqual(error, err.getvalue())

        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 12
    def testAaSequencesChangesTranslationErrorRaise(self):
        """
        Check that a NoSlipperySequenceError is raised when checking AA
        sequences and no onError argument is passed.
        """
        features = Features(
            {
                'orf1ab': {
                    'name': 'ORF1ab polyprotein',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        error = r"^No slippery sequence found\.$"
        self.assertRaisesRegex(
            NoSlipperySequenceError, error, genome.checkFeature,
            'orf1ab', 'A100000A', False)
Exemplo n.º 13
    def testNtSequencesChangesIndexErrorRaise(self):
        """
        If we check on nucleotide sequences with an out-of-range
        check, an IndexError must be raised.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Parentheses and the final period are escaped because this is
        # matched as a regular expression.
        error = (r"^Index 99999 out of range trying to access feature "
                 r"'spike' of length 4 sequence 'refId \(spike\)' via "
                 r"expected change specification 'A100000A'\.$")
        self.assertRaisesRegex(IndexError, error, genome.checkFeature,
                               'spike', 'A100000A', True)
Exemplo n.º 14
    def testNtSequencesGenomeSNP(self):
        """
        The genome must be able to have a SNP relative to the reference.
        """
        referenceSequence = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # Same as the reference except for a single T -> A substitution.
        genomeFeature = 'TGGCGTGGA' + ('T' * 9) + 'A' + ('T' * 10) + 'CAAATCGG'
        genomeSequence = 'CCCGG' + genomeFeature + 'CCCCCCC'

        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': referenceSequence,
                    'start': 0,
                    'stop': len(referenceSequence),
                },
            },
            DNARead('refId', referenceSequence))

        genome = SARS2Genome(DNARead('genId', genomeSequence), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        expected = 'TGGCGTGGA' + ('T' * 9) + 'A' + ('T' * 10) + 'CAAATCGG'
        self.assertEqual(expected, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        self.assertEqual(referenceSequence, referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        testCount, errorCount, result = genome.checkFeature(
            'spike', 'T19A', True)

        self.assertEqual(1, testCount)
        self.assertEqual(0, errorCount)
        self.assertEqual((True, 'T', True, 'A'), result['T19A'])
Exemplo n.º 15
    def testNtSequences(self):
        """
        It must be possible to retrieve aligned nucleotide sequences.
        """
        features = Features(
            {
                'spike': {
                    'name': 'spike',
                    'sequence': 'ATTC',
                    'start': 0,
                    'stop': 4,
                },
            },
            DNARead('refId', 'ATTC'))

        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # The flanking 'GG' on either side of the genome is not part of
        # the returned feature sequence.
        self.assertEqual('ATTC', genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        self.assertEqual('ATTC', referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)
Exemplo n.º 16
from unittest import TestCase

from dark.reads import DNARead

from sars2seq.features import Features

_FEATURES = Features()

class TestFeatures(TestCase):
    """
    Test the Features class.
    """
    def testGetFeatures(self):
        """
        The getitem method must return a dict.
        """
        self.assertIsInstance(_FEATURES['spike'], dict)

    def testUnknownFeature(self):
        """
        If an unknown feature is asked for, a KeyError must be raised.
        """
        # __getitem__ is passed as the callable so that the lookup happens
        # inside assertRaisesRegex.
        self.assertRaisesRegex(KeyError, "^'xx'$", _FEATURES.__getitem__, 'xx')

    def testPassingDict(self):
        It must be possible to initialize a Features instance via a dict.
        value = {
            'name': 'spike',
Exemplo n.º 17
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Prints a legend mapping short sequence ids to full ids, then processes
    each genome in args.genome (an optional variant summary followed by the
    wanted features), and finally prints summaries of the named matches and
    change sets that were found.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    outDir = args.outDir
    if outDir:
        if not exists(outDir):
            # NOTE(review): the body of this branch was missing from the
            # source; creating the output directory is the assumed intent.
            from os import makedirs
            makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # Materialize the names: wantedFeatures is iterated once per
            # genome below, and a bare map() iterator would be exhausted
            # after the first genome.
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    else:
        if args.noFeatures:
            wantedFeatures = []
        else:
            wantedFeatures = sorted(features)

    namedMatches = defaultdict(list)
    foundSets = defaultdict(list)

    reads = list(FastaReads(args.genome))

    # Shorten each read id to its first whitespace-separated token and
    # remember the mapping so a legend can be printed.
    maxLen = 0
    nameSummary = []
    for read in reads:
        shortId = read.id.split()[0]
        if len(shortId) > maxLen:
            maxLen = len(shortId)
        nameSummary.append((shortId, read.id))
        read.id = shortId

    for shortId, longId in nameSummary:
        print(f'{shortId:{maxLen}s} = {longId}')

    print('\nPER-SEQUENCE RESULTS\n')

    for read in reads:
        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                nCount = genome.genome.sequence.count('N')
                genomeLen = len(genome.genome)
                nonNCount = genomeLen - nCount
                # Guard against a zero-length genome.
                coverage = nonNCount / genomeLen if genomeLen else 0.0
                print(f'{read.id} (coverage {nonNCount}/{genomeLen} = '
                      f'{coverage * 100.0:.2f} %)', file=fp)

                theseNamedMatches, theseFoundSets = printVariantSummary(
                    genome, fp, args)

                # NOTE(review): the bodies of the next two loops were
                # missing from the source. Accumulating the ids into the
                # defaultdict(list) summaries is the assumed intent, since
                # those are what get counted and listed below - confirm.
                for match, ids in theseNamedMatches.items():
                    namedMatches[match].extend(ids)

                for match, ids in theseFoundSets.items():
                    foundSets[match].extend(ids)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, features, genome, fps, i, args)

    # NOTE(review): 'key' (the sort key used below) is not defined in the
    # visible source; it is presumably defined or imported elsewhere.
    if namedMatches:
        print('Named change sets:')
        for changeSet in sorted(CHANGE_SETS):
            desc = ', '.join(sorted(CHANGE_SETS[changeSet], key=key))
            print(f'  {changeSet}: {desc}')

        print('Known variant combinations matched (count):')
        for match in sorted(namedMatches):
            print(f'  {match} ({len(namedMatches[match])}):')
            for name in sorted(namedMatches[match]):
                print(f'    {name}')
        if foundSets:
            # NOTE(review): the body of this branch was missing from the
            # source; a blank separator line is assumed.
            print()

    if foundSets:
        print('Sets of changes found (count):')
        for match in sorted(foundSets):
            desc = ', '.join(sorted(match, key=key))
            print(f'  {desc} ({len(foundSets[match])}):')
            for name in sorted(foundSets[match]):
                print(f'    {name}')
Exemplo n.º 18
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Each genome read from args.genome is optionally filtered on its
    coverage, then has an optional variant summary written and each wanted
    feature processed.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @return: An C{int} exit status.
    """
    outDir = args.outDir
    if outDir:
        if not exists(outDir):
            # NOTE(review): the body of this branch was missing from the
            # source; creating the output directory is the assumed intent.
            from os import makedirs
            makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # Materialize the names: wantedFeatures is iterated once per
            # genome below, and a bare map() iterator would be exhausted
            # after the first genome.
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    else:
        if args.noFeatures:
            wantedFeatures = []
        else:
            wantedFeatures = sorted(features)

    if not (args.checkVariant or wantedFeatures):
        print('No action specified - I have nothing to do!', file=sys.stderr)
        return 1

    # count is pre-set so the final message works when there are no genomes.
    count = ignoredDueToCoverageCount = 0

    for count, read in enumerate(FastaReads(args.genome), start=1):
        if args.minReferenceCoverage is not None:
            # NOTE(review): the denominator was missing from the source.
            # The reference length is assumed because the message below
            # speaks of coverage "of the reference" - confirm.
            coverage = ((len(read) - read.sequence.upper().count('N')) /
                        len(features.reference))
            if coverage < args.minReferenceCoverage:
                ignoredDueToCoverageCount += 1
                # NOTE(review): the print( opener, the file= argument and
                # the continue were missing from the source and are
                # reconstructed - confirm.
                print(
                    f'Genome {read.id!r} ignored due to low '
                    f'({coverage * 100.0:.2f}%) coverage of the reference.',
                    file=sys.stderr)
                continue

        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                print(read.id, file=fp)
                printVariantSummary(genome, fp, args)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, genome, fps, i, args)

    print(f'Examined {count} genomes.')

    if args.minReferenceCoverage is not None:
        # NOTE(review): the end of this message was missing from the
        # source; the wording below is a reconstruction - confirm.
        print(f'Ignored {ignoredDueToCoverageCount} genomes due to low '
              f'coverage of the reference.')

    return 0
Exemplo n.º 19
from unittest import TestCase

from os.path import dirname, join

from .fasta import getSequence

import sars2seq
from sars2seq.checker import Checker, AAChecker, NTChecker
from sars2seq.features import Features
from sars2seq.genome import SARS2Genome

DATA_DIR = join(dirname(dirname(sars2seq.__file__)), 'data')
REF_GB = join(DATA_DIR, 'NC_045512.2.gb')

class Test_EPI_ISL_601443(TestCase):
    Test the EPI_ISL_601433 sequence. This is the variant of concern
    (VOC 202012/01) referred to in https://www.gov.uk/government/publications/
    genomeRead = getSequence(join(DATA_DIR, 'EPI_ISL_601443.fasta'))
    genome = SARS2Genome(genomeRead, FEATURES)

    def testIndexError(self):
        If an check on a non-existent index is attempted, an IndexError must
        be raised.
        checker = Checker('spike', 'N500001Y', False)