def test_parse_gene_symbols_multi_alts_multi_symbols(self):
    ''' check parse_gene_symbols() with several symbols for each alt allele
    '''
    
    # each field holds 'D|X' for the first allele and 'E|Y' for the second
    info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    alts = ('G', 'C')
    
    # the expected objects are built from single-allele values, hence idx=0
    first_allele = Symbols(info={
        'HGNC_ID': 'D|X',
        'HGNC': 'D|X',
        'SYMBOL': 'D|X',
        'ENSG': 'D|X',
        'ENST': 'D|X',
        'ENSP': 'D|X',
        'ENSR': 'D|X',
    }, idx=0)
    second_allele = Symbols(info={
        'HGNC_ID': 'E|Y',
        'HGNC': 'E|Y',
        'SYMBOL': 'E|Y',
        'ENSG': 'E|Y',
        'ENST': 'E|Y',
        'ENSP': 'E|Y',
        'ENSR': 'E|Y',
    }, idx=0)
    
    self.assertEqual(info.parse_gene_symbols(alts, []),
        [first_allele, second_allele])
def test_second_allele(self):
    ''' check that a Symbols object built with idx=1 uses the second allele
    '''
    
    fields = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
    second = Symbols(fields, 1)
    
    # 'C' has an HGNC_ID of '3' in the second allele
    self.assertEqual(second.get('C'), '3')
    
    # 'D' has an empty HGNC_ID in the second allele, so a different symbol
    # type has to supply the value
    self.assertEqual(second.get('D'), 'D')
    
    # 'A' only occurs in the first allele, so it cannot be looked up here
    with self.assertRaises(KeyError):
        second.get('A')
def test_get_genes(self):
    """ check get_genes() for empty, single, multiple and missing symbols
    """
    
    # pairs of (INFO dict for the Symbols object, expected get_genes() value)
    cases = [
        ({}, [[]]),
        ({'HGNC': 'TEST'}, [["TEST"]]),
        ({'HGNC': 'TEST1|TEST2'}, [["TEST1", "TEST2"]]),
        ({'HGNC': '.'}, [[None]]),
    ]
    
    for fields, expected in cases:
        self.var.info.symbols = [Symbols(info=fields, idx=0)]
        self.assertEqual(self.var.info.get_genes(), expected)
def test_fix_gene_IDs(self):
    """ check fix_gene_IDs() for CNVs inside and outside of a known gene
    """
    
    variant = self.var
    
    def two_gene_symbols():
        # a fresh Symbols fixture holding one known and one unknown gene
        return [Symbols(info={'HGNC_ID': 'TEST|TEST2'}, idx=0)]
    
    variant.known_genes = {"TEST": {"start": 1000, "end": 2000, "chrom": "5"}}
    
    # place a CNV at 1000-1500, which falls inside the known gene's range
    variant.info.symbols = [Symbols(info={'HGNC_ID': 'TEST'}, idx=0)]
    variant.position = 1000
    variant.info["END"] = "1500"
    
    # a CNV within a single known gene keeps its gene name
    variant.fix_gene_IDs()
    self.assertEqual(variant.info.get_genes(), [['TEST']])
    
    # names which are absent from the known genes dict are left alone
    variant.info.symbols = two_gene_symbols()
    variant.fix_gene_IDs()
    self.assertEqual(variant.info.get_genes(), [['TEST', 'TEST2']])
    
    # once the CNV no longer overlaps the known gene, that gene's name is
    # dropped, while the unknown gene's name is retained
    variant.position = 900
    variant.info["END"] = "950"
    variant.fix_gene_IDs()
    self.assertEqual(variant.info.get_genes(), [[None, 'TEST2']])
    
    # without any known genes, the names pass through unchanged
    variant.info.symbols = two_gene_symbols()
    variant.known_genes = None
    variant.fix_gene_IDs()
    self.assertEqual(variant.info.get_genes(), [['TEST', 'TEST2']])
def test_set_gene_from_info_cnv(self):
    """ check which INFO keys parse_gene_symbols() picks up for the variant
    """
    
    variant = self.var
    
    # reset the known genes, since values from the test_variant_info.py unit
    # tests have been seen to leak into this test (cause unknown)
    variant.known_genes = None
    
    # a HGNC entry is picked up as a gene symbol
    variant.info["HGNC"] = "A"
    parsed = variant.info.parse_gene_symbols(variant.alt_alleles, [])
    self.assertEqual(parsed, [Symbols(info={'HGNC': 'A'}, idx=0)])
    
    # a HGNC_ALL entry is ignored, so without HGNC the symbols are empty
    variant.info["HGNC_ALL"] = "B"
    del variant.info["HGNC"]
    parsed = variant.info.parse_gene_symbols(variant.alt_alleles, [])
    self.assertEqual(parsed, [Symbols(info={}, idx=0)])
def test_parse_gene_symbols_missing_gene(self):
    ''' check parse_gene_symbols() when the INFO lacks any gene symbol fields
    '''
    
    # an empty INFO string leaves no possible source for a gene symbol
    empty = Info('')
    expected = [Symbols(info={}, idx=0)]
    
    self.assertEqual(empty.parse_gene_symbols(('C', ), []), expected)
def test_parse_gene_symbols_multi_alts(self):
    ''' check parse_gene_symbols() when we have multiple alleles
    '''
    
    # every symbol field holds 'D' for the first allele, 'E' for the second
    info = Info(
        'HGNC_ID=D,E;HGNC=D,E;SYMBOL=D,E;ENSG=D,E;ENST=D,E;ENSP=D,E;ENSR=D,E'
    )
    alts = ('G', 'C')
    
    # the expected objects are built from single-allele values, hence idx=0
    self.assertEqual(info.parse_gene_symbols(alts, []), [
        Symbols(info={
            'HGNC_ID': 'D',
            'HGNC': 'D',
            'SYMBOL': 'D',
            'ENSG': 'D',
            'ENST': 'D',
            'ENSP': 'D',
            'ENSR': 'D',
        }, idx=0),
        Symbols(info={
            'HGNC_ID': 'E',
            'HGNC': 'E',
            'SYMBOL': 'E',
            'ENSG': 'E',
            'ENST': 'E',
            'ENSP': 'E',
            'ENSR': 'E',
        }, idx=0),
    ])
    
    # if we have more alleles than the available symbols, we get an error.
    # This uses the two-allele INFO defined above rather than the shared
    # self.info fixture, so the check does not depend on what setUp provides.
    # NOTE: this doesn't check if we have fewer alleles than symbols
    alts = ('G', 'T', 'C')
    with self.assertRaises(IndexError):
        info.parse_gene_symbols(alts, [])
def test_get_per_gene_consequence(self):
    """ check get_per_gene_consequence() across single and multi-gene sites
    """
    
    def check(gene, expected):
        # compare the consequences reported for one gene query
        self.assertEqual(self.info.get_per_gene_consequence(gene), expected)
    
    # a variant with a single gene and a single consequence
    self.info.symbols = [Symbols(info={'HGNC': 'ATRX'}, idx=0)]
    self.info.consequence = [["missense_variant"]]
    check(None, ["missense_variant"])
    check("ATRX", ["missense_variant"])
    check("TEST", [])
    
    # for a variant with consequences in multiple genes, only the
    # consequences for the requested gene are pulled out
    self.info.symbols = [Symbols(info={'HGNC': 'ATRX|TTN'}, idx=0)]
    self.info.consequence = [["missense_variant", "synonymous_variant"]]
    check("ATRX", ["missense_variant"])
    check("TTN", ["synonymous_variant"])
    
    # when the same symbol occurs twice, only the first consequence is used
    self.info.symbols = [Symbols(info={'HGNC': 'TEMP|ATRX|TEMP'}, idx=0)]
    self.info.consequence = [[
        "splice_acceptor_variant", "missense_variant", "synonymous_variant"
    ]]
    check("TEMP", ["splice_acceptor_variant"])
    
    # empty gene symbols either side of the requested gene
    self.info.symbols = [Symbols(info={'HGNC': '|ATRX|'}, idx=0)]
    self.info.consequence = [[
        "splice_acceptor_variant", "missense_variant", "synonymous_variant"
    ]]
    check("ATRX", ["missense_variant"])
def parse_gene_symbols(self, alts, masked):
    """ builds a Symbols object for each unmasked alternative allele.
    
    Args:
        alts: list of alternative alleles for the variant
        masked: list of alternative alleles that we don't consider. These
            are identified as alt alleles with zero depth in the individual.
            This can occur due to multi-sample calling.
    
    Returns:
        list of Symbols objects, one per alternative allele which is not in
        masked. Each is constructed from self.info and the position of the
        allele within alts, so the positions of masked alleles are skipped
        rather than renumbered.
    """
    
    return [
        Symbols(self.info, idx)
        for idx, allele in enumerate(alts)
        if allele not in masked
    ]
def test_parse_gene_symbols(self):
    """ check parse_gene_symbols() for a single alt allele
    """
    
    alts = ('C', )
    
    def parse():
        # parse the current INFO without masking any alleles
        return self.info.parse_gene_symbols(alts, [])
    
    # a HGNC_ID key which exists
    self.info["HGNC_ID"] = "A"
    self.assertEqual(parse(), [Symbols(info={'HGNC_ID': 'A'}, idx=0)])
    
    # a HGNC_ID key which does not exist
    del self.info["HGNC_ID"]
    self.assertEqual(parse(), [Symbols(info={}, idx=0)])
    
    # several gene symbols in the one field
    self.info["HGNC_ID"] = "A|B|C"
    self.assertEqual(parse(), [Symbols(info={'HGNC_ID': 'A|B|C'}, idx=0)])
    
    # several gene symbols, some of which are missing
    self.info["HGNC_ID"] = "|.|C"
    self.assertEqual(parse(), [Symbols(info={'HGNC_ID': '||C'}, idx=0)])
    
    # several gene symbols, where some of the missing symbols have
    # alternates in other symbol fields
    self.info["HGNC_ID"] = ".|.|C"
    self.info["HGNC"] = "Z|.|C"
    expected = [Symbols(info={'HGNC_ID': '||C', 'HGNC': 'Z||C'}, idx=0)]
    self.assertEqual(parse(), expected)
    
    # Alternate symbols should follow the correct precedence order. Note
    # that doing this properly would require checking all of the possible
    # order combinations.
    self.info["HGNC_ID"] = ".|.|C"
    self.info["HGNC"] = "Z|.|C"
    self.info["SYMBOL"] = "A|.|C"
    expected = [
        Symbols(info={
            'HGNC_ID': '||C',
            'HGNC': 'Z||C',
            "SYMBOL": "A||C",
        }, idx=0)
    ]
    self.assertEqual(parse(), expected)
def test_parse_gene_symbols_multi_alts_masked_alt(self):
    ''' check parse_gene_symbols() when some or all alt alleles are masked
    '''
    
    info = Info('HGNC_ID=D|X,E|Y;HGNC=D|X,E|Y;SYMBOL=D|X,E|Y;ENSG=D|X,E|Y;'
        'ENST=D|X,E|Y;ENSP=D|X,E|Y;ENSR=D|X,E|Y')
    alts = ('G', 'C')
    
    # with the second allele masked, only the first allele's symbols remain
    first_only = [
        Symbols(info={
            'HGNC_ID': 'D|X',
            'HGNC': 'D|X',
            'SYMBOL': 'D|X',
            'ENSG': 'D|X',
            'ENST': 'D|X',
            'ENSP': 'D|X',
            'ENSR': 'D|X',
        }, idx=0)
    ]
    self.assertEqual(info.parse_gene_symbols(alts, ['C']), first_only)
    
    # with both alleles masked, nothing is returned
    self.assertEqual(info.parse_gene_symbols(alts, ['C', 'G']), [])
class TestVariantSymbolsPy(unittest.TestCase):
    """ unit testing of the Symbols class
    """
    
    def setUp(self):
        ''' construct a Symbols object for the first allele of two
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        self.symbols = Symbols(info, 0)
    
    def test___repr__(self):
        ''' test Symbols repr
        '''
        
        self.assertEqual(repr(self.symbols), "Symbols(info={'ENSG': '|', "
            "'ENSP': '|', 'ENSR': '|', 'ENST': '|', 'HGNC': 'A|B', "
            "'HGNC_ID': '1|2', 'SYMBOL': 'Z|H'}, idx=0)")
    
    def test_prioritise(self):
        ''' test that we correctly prioritise gene symbols
        '''
        
        self.assertEqual(self.symbols.prioritise(), ['1', '2'])
        self.assertEqual(self.symbols.prioritise(priority=['HGNC']),
            ['A', 'B'])
        self.assertEqual(self.symbols.prioritise(priority=['ENST', 'HGNC']),
            ['A', 'B'])
    
    def test_get_preferred(self):
        ''' test that we can get a symbol, prioritising by symbol type
        '''
        
        values = {'HGNC': 'A', 'HGNC_ID': '1', 'SYMBOL': 'Z', 'ENSG': None,
            'ENST': None, 'ENSP': None, 'ENSR': None}
        
        # default to HGNC_ID first
        self.assertEqual(self.symbols.get_preferred(values), '1')
        
        # if we provide a list of symbols, check that order instead
        self.assertEqual(self.symbols.get_preferred(values, ['HGNC']), 'A')
        self.assertEqual(self.symbols.get_preferred(values, ['ENST']), None)
        
        # run through the list of preferred symbol types until we hit the
        # end, or get a non-None value
        self.assertEqual(
            self.symbols.get_preferred(values, ['ENST', 'HGNC']), 'A')
    
    def test_get(self):
        ''' test that we can retrieve gene symbols
        '''
        
        self.assertEqual(self.symbols.get('A'), '1')
        self.assertEqual(self.symbols.get('A', ['SYMBOL']), 'Z')
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        self.assertEqual(self.symbols.get('A', ['ENST']), None)
        self.assertEqual(self.symbols.get('A', ['ENST', 'SYMBOL']), 'Z')
        
        # Each failing lookup gets its own context manager. When both calls
        # shared one assertRaises block, the first exception ended the block
        # and the second call was never executed.
        with self.assertRaises(KeyError):
            self.symbols.get('C', ['ENST'])
        
        with self.assertRaises(KeyError):
            self.symbols.get('A', 'UNKNOWN')
    
    def test_second_allele(self):
        ''' test that we can set gene symbols from the second allele
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        symbols = Symbols(info, 1)
        
        self.assertEqual(symbols.get('C'), '3')
        
        # test that if a symbol is missing, we skip to an alternate
        self.assertEqual(symbols.get('D'), 'D')
        
        # check that we can't retrieve anything for a symbol in the other
        # allele
        with self.assertRaises(KeyError):
            symbols.get('A')
    
    def test_out_of_index_allele(self):
        ''' raise an error if we construct a class for a non-existent allele
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        
        # only two alleles are available, so index 2 is out of range
        with self.assertRaises(IndexError):
            Symbols(info, 2)
    
    def test_set(self):
        ''' test we can set gene symbols after the class has been instantiated
        '''
        
        # try a key that does not currently exist
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        
        # set the key
        self.symbols.set('A', 'H', 'ENST')
        self.assertEqual(self.symbols.get('A', 'ENST'), 'H')
        
        # overwrite an existing key
        self.symbols.set('A', 'O', 'HGNC')
        self.assertEqual(self.symbols.get('O', 'HGNC'), 'O')
        
        # the overwritten HGNC value can no longer be used for lookups
        with self.assertRaises(KeyError):
            self.symbols.get('A')
def setUp(self):
    ''' construct a Symbols object for the first allele of two
    '''
    
    self.symbols = Symbols(
        {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}, 0)
class TestVariantSymbolsPy(unittest.TestCase):
    """ unit testing of the Symbols class
    """
    
    def setUp(self):
        ''' construct a Symbols object for the first allele of two
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        self.symbols = Symbols(info, 0)
    
    def test___repr__(self):
        ''' test Symbols repr
        '''
        
        self.assertEqual(repr(self.symbols), "Symbols(info={'ENSG': '|', "
            "'ENSP': '|', 'ENSR': '|', 'ENST': '|', 'HGNC': 'A|B', "
            "'HGNC_ID': '1|2', 'SYMBOL': 'Z|H'}, idx=0)")
    
    def test_prioritise(self):
        ''' test that we correctly prioritise gene symbols
        '''
        
        self.assertEqual(self.symbols.prioritise(), ['1', '2'])
        self.assertEqual(self.symbols.prioritise(priority=['HGNC']),
            ['A', 'B'])
        self.assertEqual(self.symbols.prioritise(priority=['ENST', 'HGNC']),
            ['A', 'B'])
    
    def test_get_preferred(self):
        ''' test that we can get a symbol, prioritising by symbol type
        '''
        
        values = {
            'HGNC': 'A',
            'HGNC_ID': '1',
            'SYMBOL': 'Z',
            'ENSG': None,
            'ENST': None,
            'ENSP': None,
            'ENSR': None
        }
        
        # default to HGNC_ID first
        self.assertEqual(self.symbols.get_preferred(values), '1')
        
        # if we provide a list of symbols, check that order instead
        self.assertEqual(self.symbols.get_preferred(values, ['HGNC']), 'A')
        self.assertEqual(self.symbols.get_preferred(values, ['ENST']), None)
        
        # run through the list of preferred symbol types until we hit the
        # end, or get a non-None value
        self.assertEqual(
            self.symbols.get_preferred(values, ['ENST', 'HGNC']), 'A')
    
    def test_get(self):
        ''' test that we can retrieve gene symbols
        '''
        
        self.assertEqual(self.symbols.get('A'), '1')
        self.assertEqual(self.symbols.get('A', ['SYMBOL']), 'Z')
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        self.assertEqual(self.symbols.get('A', ['ENST']), None)
        self.assertEqual(self.symbols.get('A', ['ENST', 'SYMBOL']), 'Z')
        
        # Each failing lookup gets its own context manager. When both calls
        # shared one assertRaises block, the first exception ended the block
        # and the second call was never executed.
        with self.assertRaises(KeyError):
            self.symbols.get('C', ['ENST'])
        
        with self.assertRaises(KeyError):
            self.symbols.get('A', 'UNKNOWN')
    
    def test_second_allele(self):
        ''' test that we can set gene symbols from the second allele
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        symbols = Symbols(info, 1)
        
        self.assertEqual(symbols.get('C'), '3')
        
        # test that if a symbol is missing, we skip to an alternate
        self.assertEqual(symbols.get('D'), 'D')
        
        # check that we can't retrieve anything for a symbol in the other
        # allele
        with self.assertRaises(KeyError):
            symbols.get('A')
    
    def test_out_of_index_allele(self):
        ''' raise an error if we construct a class for a non-existent allele
        '''
        
        info = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
        
        # only two alleles are available, so index 2 is out of range
        with self.assertRaises(IndexError):
            Symbols(info, 2)
    
    def test_set(self):
        ''' test we can set gene symbols after the class has been instantiated
        '''
        
        # try a key that does not currently exist
        self.assertEqual(self.symbols.get('A', 'ENST'), None)
        
        # set the key
        self.symbols.set('A', 'H', 'ENST')
        self.assertEqual(self.symbols.get('A', 'ENST'), 'H')
        
        # overwrite an existing key
        self.symbols.set('A', 'O', 'HGNC')
        self.assertEqual(self.symbols.get('O', 'HGNC'), 'O')
        
        # the overwritten HGNC value can no longer be used for lookups
        with self.assertRaises(KeyError):
            self.symbols.get('A')
def test_out_of_index_allele(self):
    ''' check that building Symbols for a non-existent allele raises an error
    '''
    
    # each field only holds values for two alleles (indices 0 and 1)
    two_alleles = {'HGNC': 'A|B,C|D', 'HGNC_ID': '1|2,3|', 'SYMBOL': 'Z|H,|'}
    
    with self.assertRaises(IndexError):
        Symbols(two_alleles, 2)