Пример #1
0
    def test_parse_gene_symbols_multi_alts_multi_symbols(self):
        ''' check parse_gene_symbols() with two alts and two symbols per alt

        Every INFO field used here holds two comma-separated entries ('D|X'
        and 'E|Y'), and the test expects one Symbols object per entry.
        '''

        text = ('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
                'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        observed = Info(text).parse_gene_symbols(('G', 'C'), [])

        # every symbol field carries the same value for a given allele, so
        # build the expected dicts from the field names
        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        expected = [
            Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0),
            Symbols(info=dict.fromkeys(fields, 'E|Y'), idx=0),
        ]

        self.assertEqual(observed, expected)
 def test_second_allele(self):
     ''' test that Symbols built with idx=1 uses the second allele's entries
     '''
     second = Symbols(
         {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}, 1)

     # 'C' has a HGNC_ID entry ('3') in the second allele
     self.assertEqual(second.get('C'), '3')

     # 'D' has empty HGNC_ID and SYMBOL entries, so an alternate symbol type
     # is expected to be returned instead
     self.assertEqual(second.get('D'), 'D')

     # 'A' only occurs in the first allele, so it must not be retrievable
     self.assertRaises(KeyError, second.get, 'A')
Пример #3
0
    def test_get_genes(self):
        """ test that get_genes() reports the genes for each allele
        """

        # pairs of (symbol info passed to Symbols, expected get_genes() result)
        cases = [
            # no symbol fields at all
            ({}, [[]]),
            # a single gene
            ({'HGNC': 'TEST'}, [["TEST"]]),
            # two pipe-separated genes
            ({'HGNC': 'TEST1|TEST2'}, [["TEST1", "TEST2"]]),
            # a '.' placeholder is expected to be reported as None
            ({'HGNC': '.'}, [[None]]),
        ]

        for symbol_info, expected in cases:
            self.var.info.symbols = [Symbols(info=symbol_info, idx=0)]
            self.assertEqual(self.var.info.get_genes(), expected)
Пример #4
0
    def test_fix_gene_IDs(self):
        """ test that fix_gene_IDs() only blanks known genes the CNV misses
        """

        cnv = self.var
        cnv.known_genes = {"TEST": {"start": 1000, "end": 2000, "chrom": "5"}}

        # place the CNV at 1000-1500, within the 1000-2000 span of the known
        # TEST gene
        cnv.info.symbols = [Symbols(info={'HGNC_ID': 'TEST'}, idx=0)]
        cnv.position = 1000
        cnv.info["END"] = "1500"

        # a CNV overlapping its single known gene keeps the gene name
        cnv.fix_gene_IDs()
        self.assertEqual(cnv.info.get_genes(), [['TEST']])

        # names that are absent from the known genes dict (TEST2) are kept
        cnv.info.symbols = [Symbols(info={'HGNC_ID': 'TEST|TEST2'}, idx=0)]
        cnv.fix_gene_IDs()
        self.assertEqual(cnv.info.get_genes(), [['TEST', 'TEST2']])

        # move the CNV to 900-950, which no longer overlaps TEST: the known
        # gene is expected to become None while TEST2 is retained
        cnv.position = 900
        cnv.info["END"] = "950"
        cnv.fix_gene_IDs()
        self.assertEqual(cnv.info.get_genes(), [[None, 'TEST2']])

        # without any known genes, the gene names are left unaltered
        cnv.info.symbols = [Symbols(info={'HGNC_ID': 'TEST|TEST2'}, idx=0)]
        cnv.known_genes = None
        cnv.fix_gene_IDs()
        self.assertEqual(cnv.info.get_genes(), [['TEST', 'TEST2']])
Пример #5
0
    def test_set_gene_from_info_cnv(self):
        """ test which INFO fields parse_gene_symbols() picks up for self.var
        """

        variant = self.var

        # reset the known genes, since values set by other unit tests have
        # been seen to bleed through (the cause is not understood)
        variant.known_genes = None

        # a HGNC entry is parsed into the symbols
        variant.info["HGNC"] = "A"
        observed = variant.info.parse_gene_symbols(variant.alt_alleles, [])
        self.assertEqual(observed, [Symbols(info={'HGNC': 'A'}, idx=0)])

        # a HGNC_ALL entry on its own does not contribute any symbols
        variant.info["HGNC_ALL"] = "B"
        del variant.info["HGNC"]
        observed = variant.info.parse_gene_symbols(variant.alt_alleles, [])
        self.assertEqual(observed, [Symbols(info={}, idx=0)])
Пример #6
0
    def test_parse_gene_symbols_missing_gene(self):
        ''' check that an empty INFO gives a Symbols object without symbols
        '''

        # an empty INFO string has no field that could supply a gene symbol
        empty = Info('')

        observed = empty.parse_gene_symbols(('C', ), [])
        expected = [Symbols(info={}, idx=0)]
        self.assertEqual(observed, expected)
Пример #7
0
    def test_parse_gene_symbols_multi_alts(self):
        ''' check parse_gene_symbols() when we have multiple alleles

        Builds an INFO with two comma-separated entries per symbol field, and
        checks that two alt alleles give one Symbols object each, while three
        alt alleles raise an IndexError.
        '''

        info = Info(
            'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E'
        )
        alts = ('G', 'C')

        # each alt allele should get a Symbols object for its own entry
        self.assertEqual(info.parse_gene_symbols(alts, []), [
            Symbols(info={
                'HGNC_ID': 'D',
                'HGNC': 'D',
                'SYMBOL': 'D',
                'ENSG': 'D',
                'ENST': 'D',
                'ENSP': 'D',
                'ENSR': 'D'
            },
                    idx=0),
            Symbols(info={
                'HGNC_ID': 'E',
                'HGNC': 'E',
                'SYMBOL': 'E',
                'ENSG': 'E',
                'ENST': 'E',
                'ENSP': 'E',
                'ENSR': 'E'
            },
                    idx=0)
        ])

        # if we have more alleles than the available symbols, we get an error.
        # This is checked against the two-entry `info` built above, so that the
        # outcome does not depend on the contents of the fixture's self.info.
        # NOTE: this doesn't check if we have fewer alleles than symbols
        alts = ('G', 'T', 'C')
        with self.assertRaises(IndexError):
            info.parse_gene_symbols(alts, [])
Пример #8
0
    def test_get_per_gene_consequence(self):
        """ test that get_per_gene_consequence() picks consequences by gene
        """

        info = self.info

        # a single gene with a single consequence
        info.symbols = [Symbols(info={'HGNC': 'ATRX'}, idx=0)]
        info.consequence = [["missense_variant"]]

        for gene, expected in ((None, ["missense_variant"]),
                               ("ATRX", ["missense_variant"]),
                               ("TEST", [])):
            self.assertEqual(info.get_per_gene_consequence(gene), expected)

        # for a variant with consequences in multiple genes, we only pull out
        # the consequences for the requested gene
        info.symbols = [Symbols(info={'HGNC': 'ATRX|TTN'}, idx=0)]
        info.consequence = [["missense_variant", "synonymous_variant"]]
        for gene, expected in (("ATRX", ["missense_variant"]),
                               ("TTN", ["synonymous_variant"])):
            self.assertEqual(info.get_per_gene_consequence(gene), expected)

        three_consequences = [
            "splice_acceptor_variant", "missense_variant", "synonymous_variant"
        ]

        # when the same symbol occurs twice, only the first consequence is used
        info.symbols = [Symbols(info={'HGNC': 'TEMP|ATRX|TEMP'}, idx=0)]
        info.consequence = [list(three_consequences)]
        self.assertEqual(info.get_per_gene_consequence("TEMP"),
                         ["splice_acceptor_variant"])

        # empty gene symbols around the gene do not shift its consequence
        info.symbols = [Symbols(info={'HGNC': '|ATRX|'}, idx=0)]
        info.consequence = [list(three_consequences)]
        self.assertEqual(info.get_per_gene_consequence("ATRX"),
                         ["missense_variant"])
Пример #9
0
    def parse_gene_symbols(self, alts, masked):
        """ builds the gene symbols for each unmasked alternative allele.

        Args:
            alts: list of alternative alleles for the variant
            masked: list of alternative alleles that we don't consider. These
                are identified as alt alleles with zero depth in the individual.
                This can occur due to multi-sample calling.

        Returns:
            list of Symbols objects, one per alternative allele that is not in
            masked. Each is constructed from self.info and the position of the
            allele within alts, so a masked allele does not shift the index
            used for the alleles after it.
        """

        # first find the positions of the alleles that we keep
        unmasked = []
        for idx, allele in enumerate(alts):
            if allele in masked:
                continue
            unmasked.append(idx)

        # then construct the symbols for each retained position
        symbols = []
        for idx in unmasked:
            symbols.append(Symbols(self.info, idx))

        return symbols
Пример #10
0
    def test_parse_gene_symbols(self):
        """ test that parse_gene_symbols() works for a single alt allele
        """

        info = self.info
        alts = ('C', )

        # a HGNC_ID key exists
        info["HGNC_ID"] = "A"
        observed = info.parse_gene_symbols(alts, [])
        self.assertEqual(observed, [Symbols(info={'HGNC_ID': 'A'}, idx=0)])

        # the HGNC_ID key doesn't exist
        del info["HGNC_ID"]
        observed = info.parse_gene_symbols(alts, [])
        self.assertEqual(observed, [Symbols(info={}, idx=0)])

        # multiple gene symbols
        info["HGNC_ID"] = "A|B|C"
        observed = info.parse_gene_symbols(alts, [])
        self.assertEqual(observed,
                         [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)])

        # multiple gene symbols, some of which are missing
        info["HGNC_ID"] = "|.|C"
        observed = info.parse_gene_symbols(alts, [])
        self.assertEqual(observed, [Symbols(info={'HGNC_ID': '||C'}, idx=0)])

        # multiple gene symbols, where some missing symbols have alternates
        # in other symbol fields
        info["HGNC_ID"] = ".|.|C"
        info["HGNC"] = "Z|.|C"
        observed = info.parse_gene_symbols(alts, [])
        expected = [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)]
        self.assertEqual(observed, expected)

        # Including further alternate symbols should keep the correct
        # precedence order. Note that doing this properly would require
        # checking all of the possible order combinations.
        info["HGNC_ID"] = ".|.|C"
        info["HGNC"] = "Z|.|C"
        info["SYMBOL"] = "A|.|C"
        observed = info.parse_gene_symbols(alts, [])
        expected = [
            Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C', "SYMBOL": "A||C"},
                    idx=0)
        ]
        self.assertEqual(observed, expected)
Пример #11
0
    def test_second_allele(self):
        ''' test that Symbols built with idx=1 uses the second allele's entries
        '''
        fields = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        second = Symbols(fields, 1)

        # pairs of (symbol to look up, expected value). 'C' has a HGNC_ID
        # entry; 'D' has empty HGNC_ID and SYMBOL entries, so an alternate
        # symbol type is expected instead
        for symbol, expected in (('C', '3'), ('D', 'D')):
            self.assertEqual(second.get(symbol), expected)

        # 'A' only occurs in the first allele, so it must not be retrievable
        with self.assertRaises(KeyError):
            second.get('A')
Пример #12
0
    def test_parse_gene_symbols_multi_alts_masked_alt(self):
        ''' check that parse_gene_symbols() leaves out masked alt alleles
        '''

        info = Info(
            'HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
            'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
        alts = ('G', 'C')

        # masking the second allele leaves only the symbols of the first
        observed = info.parse_gene_symbols(alts, ['C'])
        fields = ('HGNC_ID', 'HGNC', 'SYMBOL', 'ENSG', 'ENST', 'ENSP', 'ENSR')
        expected = [Symbols(info=dict.fromkeys(fields, 'D|X'), idx=0)]
        self.assertEqual(observed, expected)

        # masking both alleles leaves nothing to parse
        observed = info.parse_gene_symbols(alts, ['C', 'G'])
        self.assertEqual(observed, [])
class TestVariantSymbolsPy(unittest.TestCase):
    """  unit testing of the Symbols class

    Each test starts from a Symbols object built by setUp() for the first
    allele (idx=0) of a two-allele set of symbol fields.
    """

    def setUp(self):
        ''' construct the Symbols object shared by the tests
        '''
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        self.symbols = Symbols(info, 0)

    def test___repr__(self):
        ''' test Symbols repr
        '''
        self.assertEqual(repr(self.symbols), "Symbols(info={'ENSG': '|', " \
            "'ENSP': '|', 'ENSR': '|', 'ENST': '|', 'HGNC': 'A|B', " \
            "'HGNC_ID': '1|2', 'SYMBOL': 'Z|H'}, idx=0)")

    def test_prioritise(self):
        ''' test that we correctly prioritise gene symbols
        '''
        # without an explicit priority, the HGNC_ID values are returned
        self.assertEqual(self.symbols.prioritise(), ['1', '2'])
        self.assertEqual(self.symbols.prioritise(priority=['HGNC']), ['A', 'B'])
        # ENST has no values here, so the next symbol type is used
        self.assertEqual(self.symbols.prioritise(priority=['ENST', 'HGNC']), ['A', 'B'])

    def test_get_preferred(self):
        ''' test that we can get a symbol, prioritising by symbol type
        '''
        values = {'HGNC': 'A', 'HGNC_ID': '1', 'SYMBOL': 'Z', 'ENSG': None,
            'ENST': None, 'ENSP': None, 'ENSR': None}

        # default to HGNC_ID first
        self.assertEqual(self.symbols.get_preferred(values), '1')

        # if we provide a list of symbols, check that order instead
        self.assertEqual(self.symbols.get_preferred(values, ['HGNC']), 'A')
        self.assertEqual(self.symbols.get_preferred(values, ['ENST']), None)

        # run through the list of preferred symbol types until we hit the end,
        # or get a non-None value
        self.assertEqual(self.symbols.get_preferred(values, ['ENST', 'HGNC']), 'A')

    def test_get(self):
        ''' test that we can retrieve gene symbols
        '''
        self.assertEqual(self.symbols.get('A'), '1')
        self.assertEqual(self.symbols.get('A', ['SYMBOL']), 'Z')

        # the priority can be given as a single string or as a list
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        self.assertEqual(self.symbols.get('A', ['ENST']), None)
        self.assertEqual(self.symbols.get('A', ['ENST', 'SYMBOL']), 'Z')

        # Each failing lookup needs its own assertRaises block, since no
        # statement after the first raising call in a block is executed.
        # 'C' belongs to the other allele.
        with self.assertRaises(KeyError):
            self.symbols.get('C', ['ENST'])

        # NOTE(review): presumably an unknown symbol type raises KeyError too;
        # confirm against Symbols.get().
        with self.assertRaises(KeyError):
            self.symbols.get('A', 'UNKNOWN')

    def test_second_allele(self):
        ''' test that we can set gene symbols from the second allele
        '''
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        symbols = Symbols(info, 1)

        self.assertEqual(symbols.get('C'), '3')

        # test that if a symbol is missing, we skip to an alternate
        self.assertEqual(symbols.get('D'), 'D')

        # check that we can't retrieve anything for a symbol in the other allele
        with self.assertRaises(KeyError):
            symbols.get('A')

    def test_out_of_index_allele(self):
        ''' raise an error if we construct a class for a non-existent allele
        '''
        # the fields only hold two comma-separated alleles, so idx=2 is invalid
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        with self.assertRaises(IndexError):
            Symbols(info, 2)

    def test_set(self):
        ''' test we can set gene symbols after the class has been instantiated
        '''

        # try a key that does not currently exist
        self.assertEqual(self.symbols.get('A', 'ENST'), None)

        # set the key
        self.symbols.set('A', 'H', 'ENST')
        self.assertEqual(self.symbols.get('A', 'ENST'), 'H')

        # overwrite an existing key
        self.symbols.set('A', 'O', 'HGNC')
        self.assertEqual(self.symbols.get('O', 'HGNC'), 'O')

        # after the HGNC symbol is replaced, the old symbol can't be looked up
        with self.assertRaises(KeyError):
            self.symbols.get('A')
Пример #14
0
 def setUp(self):
     ''' construct a Symbols object for the first allele (idx=0)
     '''
     self.symbols = Symbols(
         {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}, 0)
Пример #15
0
class TestVariantSymbolsPy(unittest.TestCase):
    """  unit testing of the Symbols class

    Each test starts from a Symbols object built by setUp() for the first
    allele (idx=0) of a two-allele set of symbol fields.
    """
    def setUp(self):
        ''' construct the Symbols object shared by the tests
        '''
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        self.symbols = Symbols(info, 0)

    def test___repr__(self):
        ''' test Symbols repr
        '''
        self.assertEqual(repr(self.symbols), "Symbols(info={'ENSG': '|', " \
            "'ENSP': '|', 'ENSR': '|', 'ENST': '|', 'HGNC': 'A|B', " \
            "'HGNC_ID': '1|2', 'SYMBOL': 'Z|H'}, idx=0)")

    def test_prioritise(self):
        ''' test that we correctly prioritise gene symbols
        '''
        # without an explicit priority, the HGNC_ID values are returned
        self.assertEqual(self.symbols.prioritise(), ['1', '2'])
        self.assertEqual(self.symbols.prioritise(priority=['HGNC']),
                         ['A', 'B'])
        # ENST has no values here, so the next symbol type is used
        self.assertEqual(self.symbols.prioritise(priority=['ENST', 'HGNC']),
                         ['A', 'B'])

    def test_get_preferred(self):
        ''' test that we can get a symbol, prioritising by symbol type
        '''
        values = {
            'HGNC': 'A',
            'HGNC_ID': '1',
            'SYMBOL': 'Z',
            'ENSG': None,
            'ENST': None,
            'ENSP': None,
            'ENSR': None
        }

        # default to HGNC_ID first
        self.assertEqual(self.symbols.get_preferred(values), '1')

        # if we provide a list of symbols, check that order instead
        self.assertEqual(self.symbols.get_preferred(values, ['HGNC']), 'A')
        self.assertEqual(self.symbols.get_preferred(values, ['ENST']), None)

        # run through the list of preferred symbol types until we hit the end,
        # or get a non-None value
        self.assertEqual(self.symbols.get_preferred(values, ['ENST', 'HGNC']),
                         'A')

    def test_get(self):
        ''' test that we can retrieve gene symbols
        '''
        self.assertEqual(self.symbols.get('A'), '1')
        self.assertEqual(self.symbols.get('A', ['SYMBOL']), 'Z')

        # the priority can be given as a single string or as a list
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        self.assertEqual(self.symbols.get('A', ['ENST']), None)
        self.assertEqual(self.symbols.get('A', ['ENST', 'SYMBOL']), 'Z')

        # Each failing lookup needs its own assertRaises block, since no
        # statement after the first raising call in a block is executed.
        # 'C' belongs to the other allele.
        with self.assertRaises(KeyError):
            self.symbols.get('C', ['ENST'])

        # NOTE(review): presumably an unknown symbol type raises KeyError too;
        # confirm against Symbols.get().
        with self.assertRaises(KeyError):
            self.symbols.get('A', 'UNKNOWN')

    def test_second_allele(self):
        ''' test that we can set gene symbols from the second allele
        '''
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        symbols = Symbols(info, 1)

        self.assertEqual(symbols.get('C'), '3')

        # test that if a symbol is missing, we skip to an alternate
        self.assertEqual(symbols.get('D'), 'D')

        # check that we can't retrieve anything for a symbol in the other allele
        with self.assertRaises(KeyError):
            symbols.get('A')

    def test_out_of_index_allele(self):
        ''' raise an error if we construct a class for a non-existent allele
        '''
        # the fields only hold two comma-separated alleles, so idx=2 is invalid
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        with self.assertRaises(IndexError):
            Symbols(info, 2)

    def test_set(self):
        ''' test we can set gene symbols after the class has been instantiated
        '''

        # try a key that does not currently exist
        self.assertEqual(self.symbols.get('A', 'ENST'), None)

        # set the key
        self.symbols.set('A', 'H', 'ENST')
        self.assertEqual(self.symbols.get('A', 'ENST'), 'H')

        # overwrite an existing key
        self.symbols.set('A', 'O', 'HGNC')
        self.assertEqual(self.symbols.get('O', 'HGNC'), 'O')

        # after the HGNC symbol is replaced, the old symbol can't be looked up
        with self.assertRaises(KeyError):
            self.symbols.get('A')
Пример #16
0
 def test_out_of_index_allele(self):
     ''' check that building Symbols for a non-existent allele raises an error
     '''
     # the fields only hold two comma-separated alleles, so idx=2 is invalid
     fields = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
     self.assertRaises(IndexError, Symbols, fields, 2)
 def setUp(self):
     ''' construct a Symbols object for the first allele (idx=0)
     '''
     fields = dict(HGNC='A|B,C|D', HGNC_ID='1|2,3|', SYMBOL='Z|H,|')
     self.symbols = Symbols(fields, 0)