Exemplo n.º 1
0
    def testAaSequencesChangesTranslationErrorIgnore(self):
        """
        Checking AA sequences with onError='ignore' must write nothing to
        the error stream and must return the expected failure result.
        """
        reference = DNARead('refId', 'ATTC')
        orf1ab = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'orf1ab': orf1ab}, reference))

        errFp = StringIO()
        counts = genome.checkFeature(
            'orf1ab', 'A100000A', nt=False, onError='ignore', errFp=errFp)
        testCount, errorCount, result = counts

        # Nothing may have been written to the error stream.
        self.assertEqual('', errFp.getvalue())

        # The single check is counted, and counted as an error.
        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 2
0
    def testNtSequencesChangesIndexErrorIgnore(self):
        """
        An out-of-range nucleotide check made with onError='ignore' must
        write nothing to the error stream and must return the expected
        error result.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        errFp = StringIO()
        counts = genome.checkFeature(
            'spike', 'A100000A', nt=True, onError='ignore', errFp=errFp)
        testCount, errorCount, result = counts

        # Nothing may have been written to the error stream.
        self.assertEqual('', errFp.getvalue())

        # The single check is counted, and counted as an error.
        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 3
0
    def testNtSequencesChangesString(self):
        """
        Changes given as a string specification must be checked against
        the aligned nucleotide sequences.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        # The locations in a string specification are 1-based.
        testCount, errorCount, result = genome.checkFeature(
            'spike', 'A1A T2A A3T T4T', True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)

        expected = {
            'A1A': (True, 'A', True, 'A'),
            'T2A': (True, 'T', False, 'T'),
            'A3T': (False, 'T', True, 'T'),
            'T4T': (False, 'C', False, 'C'),
        }
        for change, outcome in expected.items():
            self.assertEqual(outcome, result[change])
Exemplo n.º 4
0
 def testPassingRefence(self):
     """
     A reference passed to Features must be available, as the very same
     object, via the 'reference' attribute.
     """
     passedReference = DNARead('refId', 'ATTC')
     self.assertIs(passedReference, Features({}, passedReference).reference)
Exemplo n.º 5
0
    def testNtSequencesChangesTuple(self):
        """
        Changes given as a tuple specification must be checked against
        the aligned nucleotide sequences.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        # The offsets in a tuple specification are 0-based. Each change is
        # paired with the result expected for it.
        expected = (
            (('A', 0, 'A'), (True, 'A', True, 'A')),
            (('T', 1, 'A'), (True, 'T', False, 'T')),
            (('A', 2, 'T'), (False, 'T', True, 'T')),
            (('T', 3, 'T'), (False, 'C', False, 'C')),
        )
        changes = tuple(change for change, _ in expected)

        testCount, errorCount, result = genome.checkFeature(
            'spike', changes, True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)
        for change, outcome in expected:
            self.assertEqual(outcome, result[change])
Exemplo n.º 6
0
 def testKnownFeatures(self):
     """
     Every feature name used in the changes of any variant must be found
     in a default Features instance, with a dict as its value.
     """
     knownFeatures = Features()
     for variantInfo in VARIANTS.values():
         for name in variantInfo['changes']:
             self.assertIsInstance(knownFeatures[name], dict)
Exemplo n.º 7
0
def main(args):
    """
    Print a description of the SARS-CoV-2 annotations.

    For each feature (in sorted order) the start, stop and length are
    printed, followed by the product and function (when present), the
    nucleotide sequence and the translation (when present). Sequences
    longer than C{args.maxLen} are truncated and suffixed with '...'.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    def abbreviate(text):
        # Cut text down to args.maxLen characters, marking the cut.
        if len(text) > args.maxLen:
            return text[:args.maxLen] + '...'
        return text

    features = Features(args.gbFile)

    print(f'Features for {features.reference.id}:')

    for featureName, feature in sorted(features.items()):
        print(f'{featureName}:')
        for label, key in ('  start:', 'start'), ('  stop:', 'stop'):
            print(label, feature[key])
        print('  length:', feature['stop'] - feature['start'])

        # These keys are optional. Print the ones that are present.
        for label, key in (('  product:', 'product'),
                           ('  function:', 'function')):
            try:
                value = feature[key]
            except KeyError:
                continue
            print(label, value)

        sequence = feature['sequence']
        print(f'  sequence    (len {len(sequence):5d} nt):',
              abbreviate(sequence))

        try:
            translation = feature['translation']
        except KeyError:
            # Some features (e.g., UTR, stem loops) do not have a translation.
            continue

        print(f'  translation (len {len(translation):5d} aa):',
              abbreviate(translation))
Exemplo n.º 8
0
    def testPassingDict(self):
        """
        A Features instance built from a dict must contain the features
        of that dict.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        fromDict = Features({'spike': spike})

        self.assertIn('spike', fromDict)
        self.assertEqual(spike, fromDict['spike'])
Exemplo n.º 9
0
    def testNtSequencesGenomeGap(self):
        """
        A genome with a deletion relative to the reference must be given a
        gap in its aligned nucleotide sequence, and it must be possible to
        check for that deletion.
        """
        referenceSequence = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # The genome has a run of 19 Ts where the reference has 20.
        genomeFeature = 'TGGA' + ('T' * 19) + 'CAAATCGG'
        genomeSequence = 'CCCGGTGGCG' + genomeFeature + 'CCCCCCC'

        spike = {
            'name': 'spike',
            'sequence': referenceSequence,
            'start': 5,
            'stop': len(referenceSequence),
        }
        features = Features({'spike': spike},
                            DNARead('refId', referenceSequence))
        genome = SARS2Genome(DNARead('genId', genomeSequence), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # The reference part is the reference from offset 5 onwards.
        self.assertEqual(referenceSequence[5:], referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        # The genome part has a gap in place of the missing T.
        withGap = 'TGGA-' + ('T' * 19) + 'CAAATCGG'
        self.assertEqual(withGap, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        testCount, errorCount, result = genome.checkFeature(
            'spike', 'T5-', True)

        self.assertEqual(1, testCount)
        self.assertEqual(0, errorCount)
        self.assertEqual((True, 'T', True, '-'), result['T5-'])
Exemplo n.º 10
0
    def testAaSequencesTranslationNoSlipperySequenceRaise(self):
        """
        Calling aaSequences on an 'ORF1ab polyprotein' feature that has no
        slippery sequence must raise a NoSlipperySequenceError.
        """
        reference = DNARead('refId', 'ATTC')
        polyprotein = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(
            DNARead('genId', 'GGATTCGG'),
            Features({'ORF1ab polyprotein': polyprotein}, reference))

        with self.assertRaisesRegex(NoSlipperySequenceError,
                                    r'^No slippery sequence found\.$'):
            genome.aaSequences('ORF1ab polyprotein')
Exemplo n.º 11
0
    def testNtSequencesChangesIndexErrorPrint(self):
        """
        An out-of-range nucleotide check made with onError='print' must
        write the error messages to the error stream and must return the
        expected error result.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        # Two lines are expected: the first names the reference sequence,
        # the second names the genome sequence.
        expectedOutput = (
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'refId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'genId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
        )

        errFp = StringIO()
        counts = genome.checkFeature(
            'spike', 'A100000A', nt=True, onError='print', errFp=errFp)
        testCount, errorCount, result = counts

        self.assertEqual(expectedOutput, errFp.getvalue())

        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result['A100000A'])
Exemplo n.º 12
0
    def testAaSequencesChangesTranslationErrorRaise(self):
        """
        Checking AA sequences of a feature with no slippery sequence must
        raise a NoSlipperySequenceError.
        """
        reference = DNARead('refId', 'ATTC')
        orf1ab = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'orf1ab': orf1ab}, reference))

        with self.assertRaisesRegex(NoSlipperySequenceError,
                                    r"^No slippery sequence found\.$"):
            genome.checkFeature('orf1ab', 'A100000A', False)
Exemplo n.º 13
0
    def testNtSequencesChangesIndexErrorRaise(self):
        """
        An out-of-range nucleotide check must raise an IndexError whose
        message identifies the feature, the sequence and the change.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        expectedMessage = (
            r"^Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'refId \(spike\)' via "
            r"expected change specification 'A100000A'\.$")

        with self.assertRaisesRegex(IndexError, expectedMessage):
            genome.checkFeature('spike', 'A100000A', True)
Exemplo n.º 14
0
    def testNtSequencesGenomeSNP(self):
        """
        A genome with a single nucleotide change relative to the reference
        must be aligned without gaps, and it must be possible to check for
        that change.
        """
        referenceSequence = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # The genome has an A in the middle of the reference's run of Ts.
        genomeFeature = 'TGGCGTGGA' + ('T' * 9) + 'A' + ('T' * 10) + 'CAAATCGG'
        genomeSequence = 'CCCGG' + genomeFeature + 'CCCCCCC'

        spike = {
            'name': 'spike',
            'sequence': referenceSequence,
            'start': 0,
            'stop': len(referenceSequence),
        }
        features = Features({'spike': spike},
                            DNARead('refId', referenceSequence))
        genome = SARS2Genome(DNARead('genId', genomeSequence), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        withSNP = 'TGGCGTGGA' + ('T' * 9) + 'A' + ('T' * 10) + 'CAAATCGG'
        self.assertEqual(withSNP, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        self.assertEqual(referenceSequence, referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        testCount, errorCount, result = genome.checkFeature(
            'spike', 'T19A', True)

        self.assertEqual(1, testCount)
        self.assertEqual(0, errorCount)
        self.assertEqual((True, 'T', True, 'A'), result['T19A'])
Exemplo n.º 15
0
    def testNtSequences(self):
        """
        The ntSequences method must return the aligned reference and
        genome nucleotide sequences for a feature.
        """
        reference = DNARead('refId', 'ATTC')
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'),
                             Features({'spike': spike}, reference))

        referenceNt, genomeNt = genome.ntSequences('spike')

        # The ids must have the feature name appended in parentheses.
        self.assertEqual('ATTC', genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        self.assertEqual('ATTC', referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)
Exemplo n.º 16
0
from unittest import TestCase

from dark.reads import DNARead

from sars2seq.features import Features

# A Features instance made with default arguments, shared by the tests below
# that only read from it.
_FEATURES = Features()


class TestFeatures(TestCase):
    """
    Test the Features class.
    """
    def testGetFeatures(self):
        """
        Looking up a feature by name must produce a dict.
        """
        spike = _FEATURES['spike']
        self.assertIsInstance(spike, dict)

    def testUnknownFeature(self):
        """
        Looking up a feature name that is not known must raise a KeyError
        that mentions the name.
        """
        with self.assertRaisesRegex(KeyError, "^'xx'$"):
            _FEATURES['xx']

    def testPassingDict(self):
        """
        It must be possible to initialize a Features instance via a dict.
        """
        value = {
            'name': 'spike',
Exemplo n.º 17
0
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    The genomes in C{args.genome} are read, their short names are printed,
    and then each genome is (optionally) checked for variants and has its
    wanted features processed. A summary of the variant matches found
    across all genomes is printed at the end.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    outDir = args.outDir
    if outDir and not exists(outDir):
        os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator. The wanted
            # features are iterated once per genome below and an iterator
            # would be exhausted after the first genome, silently skipping
            # feature processing for all the others.
            wantedFeatures = [features.canonicalName(name)
                              for name in args.feature]
        else:
            wantedFeatures = args.feature
    elif args.noFeatures:
        wantedFeatures = []
    else:
        wantedFeatures = sorted(features)

    # Both map a match to the list of genome ids it was found in.
    namedMatches = defaultdict(list)
    foundSets = defaultdict(list)

    reads = list(FastaReads(args.genome))

    print('SEQUENCE SHORT NAMES\n')
    maxLen = 0
    nameSummary = []
    for read in reads:
        # The short id is the first whitespace-separated field of the id.
        # It replaces the full id on the read from here on.
        shortId = read.id.split()[0]
        maxLen = max(maxLen, len(shortId))
        nameSummary.append((shortId, read.id))
        read.id = shortId

    for shortId, longId in nameSummary:
        print(f'{shortId:{maxLen}s} = {longId}')

    print('\nPER-SEQUENCE RESULTS\n')

    for read in reads:
        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                # Coverage is the fraction of the genome that is not 'N'.
                nCount = genome.genome.sequence.count('N')
                genomeLen = len(genome.genome)
                nonNCount = genomeLen - nCount
                coverage = nonNCount / genomeLen
                print(f'{read.id} (coverage {nonNCount}/{genomeLen} = '
                      f'{coverage * 100.0:.2f} %)', file=fp)

                theseNamedMatches, theseFoundSets = printVariantSummary(
                    genome, fp, args)

                for match, ids in theseNamedMatches.items():
                    namedMatches[match].extend(ids)

                for match, ids in theseFoundSets.items():
                    foundSets[match].extend(ids)

                print(file=fp)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, features, genome, fps, i, args)

    print('\nSUMMARY\n')

    if namedMatches:
        print('Named change sets:')
        for changeSet in sorted(CHANGE_SETS):
            desc = ', '.join(sorted(CHANGE_SETS[changeSet], key=key))
            print(f'  {changeSet}: {desc}')
        print()

        print('Known variant combinations matched (count):')
        for match in sorted(namedMatches):
            print(f'  {match} ({len(namedMatches[match])}):')
            for name in sorted(namedMatches[match]):
                print(f'    {name}')
        if foundSets:
            # A blank line separates this section from the next one.
            print()

    if foundSets:
        print('Sets of changes found (count):')
        for match in sorted(foundSets):
            desc = ', '.join(sorted(match, key=key))
            print(f'  {desc} ({len(foundSets[match])}):')
            for name in sorted(foundSets[match]):
                print(f'    {name}')
Exemplo n.º 18
0
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Each genome in C{args.genome} is (optionally) checked for variants and
    has its wanted features processed. Genomes whose coverage of the
    reference is below C{args.minReferenceCoverage} (when that is not
    C{None}) are skipped, with a message printed to standard error.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @return: An C{int} exit status: 1 if there was nothing to do, else 0.
    """
    outDir = args.outDir
    if outDir and not exists(outDir):
        os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator. The wanted
            # features are iterated once per genome below and an iterator
            # would be exhausted after the first genome, silently skipping
            # feature processing for all the others.
            wantedFeatures = [features.canonicalName(name)
                              for name in args.feature]
        else:
            wantedFeatures = args.feature
    elif args.noFeatures:
        wantedFeatures = []
    else:
        wantedFeatures = sorted(features)

    if not (args.checkVariant or wantedFeatures):
        print('No action specified - I have nothing to do!', file=sys.stderr)
        return 1

    # The count is initialized here so that it is defined (as zero) for the
    # final message when there are no genomes at all.
    count = ignoredDueToCoverageCount = 0

    for count, read in enumerate(FastaReads(args.genome), start=1):
        if args.minReferenceCoverage is not None:
            # Coverage is the number of non-N nucleotides in the genome
            # as a fraction of the length of the reference.
            coverage = ((len(read) - read.sequence.upper().count('N')) /
                        len(features.reference))
            if coverage < args.minReferenceCoverage:
                ignoredDueToCoverageCount += 1
                print(
                    f'Genome {read.id!r} ignored due to low '
                    f'({coverage * 100.0:.2f}%) coverage of the reference.',
                    file=sys.stderr)
                continue

        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                print(read.id, file=fp)
                printVariantSummary(genome, fp, args)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, genome, fps, i, args)

    print(f'Examined {count} genomes.')

    if args.minReferenceCoverage is not None:
        print(f'Ignored {ignoredDueToCoverageCount} genomes due to low '
              f'coverage.')

    return 0
Exemplo n.º 19
0
from unittest import TestCase

from os.path import dirname, join

from .fasta import getSequence

import sars2seq
from sars2seq.checker import Checker, AAChecker, NTChecker
from sars2seq.features import Features
from sars2seq.genome import SARS2Genome

# The 'data' directory, located in the parent of the directory that holds
# the sars2seq module file.
DATA_DIR = join(dirname(dirname(sars2seq.__file__)), 'data')
# Path to the NC_045512.2.gb file in the data directory (judging by its name,
# presumably the reference in GenBank format).
REF_GB = join(DATA_DIR, 'NC_045512.2.gb')
# Features built from that file, made once at import time for use by the
# tests in this module.
FEATURES = Features(REF_GB)


class Test_EPI_ISL_601443(TestCase):
    """
    Test the EPI_ISL_601433 sequence. This is the variant of concern
    (VOC 202012/01) referred to in https://www.gov.uk/government/publications/
    investigation-of-novel-sars-cov-2-variant-variant-of-concern-20201201
    """
    genomeRead = getSequence(join(DATA_DIR, 'EPI_ISL_601443.fasta'))
    genome = SARS2Genome(genomeRead, FEATURES)

    def testIndexError(self):
        """
        If an check on a non-existent index is attempted, an IndexError must
        be raised.
        """
        checker = Checker('spike', 'N500001Y', False)