示例#1
0
    def test_variants_to_tsv_lines_noncoding(self):
        '''_variants_to_tsv_lines with nucleotide variants on a non-coding alignment'''
        # Padded alignment: '-' marks a gap, so the same alignment column can
        # sit at a different unpadded position in each sequence.
        alignment = (
            ('seq1', 'ATG---GCTAATTAG'),
            ('seq2', 'ATG---GCTAATTAG'),
            ('seq3', 'ATGTAT---AATTAG'),
            ('seq4', 'ATGTGTTGTAATTAG'),
            ('seq5', 'ATGTTTGATAATTAG'),
        )
        padded_seqs = {name: pyfastaq.sequences.Fasta(name, bases) for name, bases in alignment}
        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)

        # One known variant is declared on seq1 and one on seq5.
        variants = {
            'seq1': [(sequence_variant.Variant('n', 'C5T', 'id1'), 'description 1')],
            'seq5': [(sequence_variant.Variant('n', 'A5T', 'id2'), 'description 2')],
        }

        # The sequence that declares each variant comes first, followed by the
        # other sequences with the position shifted to their own coordinates.
        expected = [
            'seq1\t0\t1\tC5T\tid1\tdescription 1',
            'seq2\t0\t1\tC5T\tid1\tdescription 1',
            'seq4\t0\t1\tG8T\tid1\tdescription 1',
            'seq5\t0\t1\tA8T\tid1\tdescription 1',
            'seq5\t0\t1\tA5T\tid2\tdescription 2',
            'seq3\t0\t1\tA5T\tid2\tdescription 2',
            'seq4\t0\t1\tG5T\tid2\tdescription 2',
        ]

        # NOTE(review): the trailing (False, True) flags presumably mean
        # "not coding" and "variant only" - confirm against the signature.
        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(
            variants,
            unpadded_seqs,
            padded_seqs,
            insertions,
            False,
            True,
        )
        self.assertEqual(expected, got)
示例#2
0
    def test_variants_to_tsv_lines_coding(self):
        '''_variants_to_tsv_lines with amino acid variants on a coding alignment'''
        # Padded nucleotide alignment; the third column of each row is the
        # codon-by-codon translation, with '-' for a gapped codon.
        alignment = (
            ('seq1', 'ATG---GCTAATTAG'),  # M-AN*
            ('seq2', 'ATG---GCTAATTAG'),  # M-AN*
            ('seq3', 'ATGTTT---AATTAG'),  # MF-N*
            ('seq4', 'ATGTTTTGTAATTAG'),  # MFCN*
            ('seq5', 'ATGTTTGATAATTAG'),  # MFDN*
        )
        padded_seqs = {name: pyfastaq.sequences.Fasta(name, bases) for name, bases in alignment}
        unpadded_seqs = aln_to_metadata.AlnToMetadata._make_unpadded_seqs(padded_seqs)
        insertions = aln_to_metadata.AlnToMetadata._make_unpadded_insertion_coords(padded_seqs)

        # One known variant is declared on seq1 and one on seq5.
        variants = {
            'seq1': [(sequence_variant.Variant('p', 'A2D', 'id1'), 'description 1')],
            'seq5': [(sequence_variant.Variant('p', 'F2E', 'id2'), 'description 2')],
        }

        # The sequence that declares each variant comes first, followed by the
        # other sequences with the position shifted to their own coordinates.
        expected = [
            'seq1\t1\t0\tA2D\tid1\tdescription 1',
            'seq2\t1\t0\tA2D\tid1\tdescription 1',
            'seq4\t1\t0\tC3D\tid1\tdescription 1',
            'seq5\t1\t0\tA3D\tid1\tdescription 1',
            'seq5\t1\t0\tF2E\tid2\tdescription 2',
            'seq3\t1\t0\tF2E\tid2\tdescription 2',
            'seq4\t1\t0\tF2E\tid2\tdescription 2',
        ]

        # NOTE(review): the trailing (True, False) flags presumably mean
        # "coding" and "not variant only" - confirm against the signature.
        got = aln_to_metadata.AlnToMetadata._variants_to_tsv_lines(
            variants,
            unpadded_seqs,
            padded_seqs,
            insertions,
            True,
            False,
        )
        self.assertEqual(expected, got)
示例#3
0
    def test_variant_ids_are_unique(self):
        '''_variant_ids_are_unique accepts distinct ids and raises Error on a repeated id'''
        def entry(change, identifier, description):
            # One (Variant, description) pair, as stored in the variants dict.
            return (sequence_variant.Variant('p', change, identifier), description)

        variants = {
            'seq1': [entry('L2M', 'id1', 'description1')],
            'seq2': [entry('L2M', 'id2', 'description2')],
        }
        ids_unique = aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants)
        self.assertTrue(ids_unique)

        # Reuse 'id1' under seq2 so the identifier is no longer unique.
        variants['seq2'].append(entry('I3K', 'id1', 'description3'))
        with self.assertRaises(aln_to_metadata.Error):
            self.assertTrue(aln_to_metadata.AlnToMetadata._variant_ids_are_unique(variants))
示例#4
0
    def test_has_variant(self):
        '''has_variant for nucleotide and protein variants against one sequence'''
        seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA')  # protein: MYC*

        # (variant type, variant string, expected has_variant result)
        cases = (
            ('n', 'A2T', True),
            ('n', 'T2A', False),
            ('p', 'I2Y', True),
            ('p', 'Y2I', False),
        )

        for var_type, change, expected in cases:
            variant = sequence_variant.Variant(var_type, change, '.')
            self.assertEqual(expected, variant.has_variant(seq))
示例#5
0
 def test_load_vars_file_good_file(self):
     '''_load_vars_file on a well-formed variants file, loaded as protein variants'''
     def protein_variant(change, identifier):
         return sequence_variant.Variant('p', change, identifier)

     # Variants are grouped per reference name, as (Variant, description) pairs.
     expected = {
         'seq1': [(protein_variant('A42B', 'id1'), 'description 1')],
         'seq2': [
             (protein_variant('C43D', 'id2'), 'description 2'),
             (protein_variant('E100F', 'id3'), 'description 3'),
         ],
     }

     infile = os.path.join(data_dir, 'aln_to_metadata_load_vars_file_good.tsv')
     got = aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
     self.assertEqual(expected, got)
示例#6
0
    def test_init_str(self):
        '''str() of a Variant gives the upper-case form whatever the input case'''
        for var in ('I42K', 'i42k', 'I42k', 'i42K'):
            variant = sequence_variant.Variant('p', var, '.')
            self.assertEqual('I42K', str(variant))
示例#7
0
    def test_check_variants_match_sequences(self):
        '''_check_variants_match_sequences accepts consistent variants and raises Error otherwise'''
        seqs = {
            name: pyfastaq.sequences.Fasta(name, bases)
            for name, bases in (
                ('seq1', 'ATGCTTTAG'),
                ('seq2', 'ATGCTTCTTTAG'),
                ('seq3', 'ATG---TAG'),
            )
        }
        check = aln_to_metadata.AlnToMetadata._check_variants_match_sequences

        def variants_for(seq_name, change):
            # A variants dict holding a single protein variant for seq_name.
            return {seq_name: [(sequence_variant.Variant('p', change, 'id1'), 'description1')]}

        # seq1 has L at amino acid 2: accepted as either side of the variant.
        for change in ('L2M', 'M2L'):
            self.assertTrue(check(seqs, variants_for('seq1', change), True))

        # Neither A nor M is at amino acid 2 of seq1, and seq4 is not in seqs.
        for seq_name in ('seq1', 'seq4'):
            variants = variants_for(seq_name, 'A2M')
            with self.assertRaises(aln_to_metadata.Error):
                self.assertTrue(check(seqs, variants, True))
示例#8
0
    def test_sanity_check_against_seq_translate(self):
        '''sanity_check_against_seq when the nucleotide sequence is translated first'''
        seq = 'AGTACGACGTAC'  # protein: STTY

        # (variant string, expected result of the sanity check)
        cases = (
            ('S1X', True),
            ('x1s', True),
            ('a1y', False),
            ('x5y', False),
        )

        for var_string, expected in cases:
            variant = sequence_variant.Variant('p', var_string, '.')
            got = variant.sanity_check_against_seq(seq, translate_seq=True)
            self.assertEqual(expected, got)
示例#9
0
    def test_sanity_check_against_seq_no_translate(self):
        '''sanity_check_against_seq when the sequence is used as given (no translation)'''
        seq = 'BrissSpecialStvff'

        # (variant string, expected result of the sanity check)
        cases = (
            ('I3K', True),
            ('K3I', True),
            ('A2b', False),
            ('x1000y', False),
        )

        for var_string, expected in cases:
            variant = sequence_variant.Variant('p', var_string, '.')
            got = variant.sanity_check_against_seq(seq)
            self.assertEqual(expected, got)
示例#10
0
    def test_init_ok(self):
        '''Variant constructor parses position, wild value, variant value and identifier'''
        cases = (
            ('I42K', '.'),
            ('i42k', 'id1'),
            ('I42k', 'id2'),
            ('i42K', 'id3'),
        )

        for var_string, identifier in cases:
            parsed = sequence_variant.Variant('p', var_string, identifier)
            # Position 42 in the string is stored as 41.
            self.assertEqual(41, parsed.position)
            self.assertEqual('I', parsed.wild_value)
            self.assertEqual('K', parsed.variant_value)

            # An identifier of '.' is stored as None.
            if identifier != '.':
                self.assertEqual(identifier, parsed.identifier)
            else:
                self.assertIsNone(parsed.identifier)
示例#11
0
    def test_init_fails_on_bad_variant_strings(self):
        '''Variant constructor raises sequence_variant.Error for malformed variant strings'''
        malformed = ('x', 'x1', '1x', '1x1', 'I42K43', 'I-1K')

        for var_string in malformed:
            self.assertRaises(
                sequence_variant.Error,
                sequence_variant.Variant,
                'p',
                var_string,
                '.',
            )
示例#12
0
    def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variants_list):
        '''Build the report tuple for one amino acid change in a coding reference.

        Returns a pair (var_tuple, used_known_variants).

        var_tuple is (position, 'p', variant string, effect, mummer_variants_list,
        matching known variants, other known variants at that position). It is
        None when the observed amino acid equals the wild value of a known
        variant at that position.

        used_known_variants is the set of known variants found at the position;
        it is empty when the position has no known variants or when
        refdata_var_dict is None.
        '''
        effect, change, position = AssemblyVariants._get_variant_effect(mummer_variants_list, ref_sequence)

        position_is_known = refdata_var_dict is not None and position in refdata_var_dict['p']
        if not position_is_known:
            # Nothing is known here: always report, with no known variants attached.
            return (position, 'p', change, effect, mummer_variants_list, set(), set()), set()

        if effect == 'NONSYN':
            observed = sequence_variant.Variant('p', change, '.')
            known_here = set(refdata_var_dict['p'][observed.position])
            # Known variants whose variant value is the observed amino acid.
            matched = {known for known in known_here if observed.variant_value == known.variant.variant_value}
            # Known variants whose wild value is the observed amino acid.
            wild_type_hits = {known for known in known_here if observed.variant_value == known.variant.wild_value}
            unmatched = known_here.difference(matched)
        else:
            matched = set()
            wild_type_hits = set()
            # Deliberately the stored collection itself, not a copy.
            unmatched = refdata_var_dict['p'][position]

        report = None
        if not wild_type_hits:
            report = (position, 'p', change, effect, mummer_variants_list, matched, unmatched)

        return report, set().union(matched, unmatched)
示例#13
0
    def _load_vars_file(cls, vars_file, refs_are_coding):
        '''Load a tab-delimited file of known variants.

        Each line must have exactly four tab-separated columns: reference
        name, variant string, variant identifier and description.

        vars_file: name of the file, opened with pyfastaq.utils.open_file_read.
        refs_are_coding: if true, variants are parsed as protein ('p')
            variants, otherwise as nucleotide ('n') variants.

        Returns a dict mapping reference name to a list of
        (sequence_variant.Variant, description) tuples, in file order.

        Raises Error if a line does not have four columns or its variant
        cannot be parsed. The message includes the offending line.
        '''
        var_type = 'p' if refs_are_coding else 'n'
        variants = {}
        f = pyfastaq.utils.open_file_read(vars_file)

        # try/finally closes the file on every exit path, including a
        # failure while reading from it.
        try:
            for line in f:
                try:
                    ref_name, variant, identifier, description = line.rstrip().split('\t')
                    variant = sequence_variant.Variant(var_type, variant, identifier)
                except Exception as err:
                    # 'except Exception' rather than a bare 'except', so that
                    # KeyboardInterrupt and SystemExit are not turned into Error.
                    raise Error('Error in this line of variants file:\n' + line) from err

                variants.setdefault(ref_name, []).append((variant, description))
        finally:
            pyfastaq.utils.close(f)

        return variants
示例#14
0
    def __init__(self, line):
        '''Parse one tab-delimited line with six columns.

        The columns are: name, sequence type ("0" or "1"), variant-only flag
        ("0" or "1"), variant string (or "."), variant identifier, free text.

        Attributes set:
            name: first column.
            seq_type: 'n' if the second column is "0", 'p' if it is "1".
            variant_only: True if the third column is "1", otherwise False.
            variant: None if the fourth column is ".", otherwise a
                sequence_variant.Variant built from seq_type, the fourth
                column and the fifth column.
            free_text: sixth column.

        Raises Error if the line does not have exactly six columns, or if the
        second or third column is not "0" or "1".
        '''
        try:
            self.name, seq_type, var_only, variant, variant_id, self.free_text = line.rstrip().split('\t')
        except ValueError as err:
            # A wrong column count is the only expected failure here, so catch
            # just ValueError and let anything else propagate unchanged.
            raise Error('Error parsing line of file:\n' + line) from err

        if seq_type not in {'0', '1'}:
            raise Error('Error. Second column must be "0" or "1". Cannot continue. Line was:\n' + line)

        self.seq_type = 'n' if seq_type == '0' else 'p'

        if var_only not in {'0', '1'}:
            raise Error('Error. Third column must be "0" or "1". Cannot continue. Line was:\n' + line)

        self.variant_only = var_only == '1'

        if variant == '.':
            self.variant = None
        else:
            self.variant = sequence_variant.Variant(self.seq_type, variant, variant_id)
示例#15
0
 def test_nucleotide_range(self):
     '''nucleotide_range for a nucleotide variant and for a protein variant'''
     # (variant type, variant string, expected (start, end) range)
     cases = (
         ('n', 'A2T', (1, 1)),
         ('p', 'I42L', (123, 125)),
     )

     for var_type, change, expected in cases:
         sv = sequence_variant.Variant(var_type, change, '.')
         self.assertEqual(expected, sv.nucleotide_range())