def test_add_choice(self):
    """ test that add_choice() works correctly """
    
    # check the cumulative sum while adding in new values
    choices = WeightedChoice()
    choices.add_choice(1, 1)
    self.assertEqual(choices.get_summed_rate(), 1)
    choices.add_choice(2, 5)
    self.assertEqual(choices.get_summed_rate(), 6)
    choices.add_choice(3, 10)
    self.assertEqual(choices.get_summed_rate(), 16)
    
    # check that it works for unsorted probabilities
    choices = WeightedChoice()
    choices.add_choice(1, 1)
    choices.add_choice(2, 10)
    choices.add_choice(3, 5)
    self.assertEqual(choices.get_summed_rate(), 16)
    
    # check for very low values, with very high precision (but not
    # necessarily exactly equal)
    choices = WeightedChoice()
    choices.add_choice(1, 5e-9)
    choices.add_choice(2, 1e-8)
    choices.add_choice(3, 1.000000000000005e-10)
    self.assertAlmostEqual(choices.get_summed_rate(),
        1.51000000000000005e-8, places=23)
def test_choice_small_numbers(self):
    """ test that choice() samples at the expected proportions when some
    weights are very small.
    """
    
    iterations = 1000000
    
    # very small numbers at the end still have expected proportions
    choices = WeightedChoice()
    choices.add_choice(1, 1)
    choices.add_choice(2, 5)
    choices.add_choice(3, 0.0001)
    s = [choices.choice() for x in range(iterations)]
    self.assertAlmostEqual(s.count(3) / len(s), 0.0001, places=3)
    
    # very small numbers at the start still have expected proportions
    choices = WeightedChoice()
    choices.add_choice(1, 0.0001)
    choices.add_choice(2, 1)
    choices.add_choice(3, 5)
    s = [choices.choice() for x in range(iterations)]
    self.assertAlmostEqual(s.count(1) / len(s), 0.0001, places=3)
    
    # check that the sampling works correctly at low weight values
    choices = WeightedChoice()
    numbers = range(1000, 3000)
    small = [x * 0.000000000001 for x in numbers]
    for (name, prob) in zip(numbers, small):
        choices.add_choice(name, prob)
    s = [choices.choice() for x in range(iterations)]
    self.assertAlmostEqual(s.count(numbers[0]) / len(s), 0.0001, places=3)
def test_choice(self):
    """ test that choice() works correctly.
    
    Since WeightedChoice is a weighted random sampler, we can't rely on
    getting exact values out, so repeated samples are expected to obtain
    proportions of values equivalent to their weight value. The deviation
    from the expected proportion shrinks with larger sample sets, but at
    the cost of making the test hang for > 1 second for 1 million samples,
    or > 10 s for 10 million samples.
    """
    
    iterations = 1000000
    
    choices = WeightedChoice()
    choices.add_choice(1, 1)
    choices.add_choice(2, 5)
    s = [choices.choice() for x in range(iterations)]
    self.assertAlmostEqual(s.count(1) / len(s), 0.1667, places=2)
    
    # add another choice, then check that all of the choices have been
    # sampled at the expected proportions
    choices.add_choice(3, 4)
    s = [choices.choice() for x in range(iterations)]
    self.assertAlmostEqual(s.count(1) / len(s), 0.100, places=2)
    self.assertAlmostEqual(s.count(2) / len(s), 0.500, places=2)
    self.assertAlmostEqual(s.count(3) / len(s), 0.400, places=2)
    
    # check that all the choices have been made from the inserted values
    self.assertEqual(set(s), set([1, 2, 3]))
def test_append(self):
    """ test that append() works correctly """
    
    # construct two objects
    a = WeightedChoice()
    a.add_choice(1, 0.5)
    b = WeightedChoice()
    b.add_choice(2, 1)
    
    # add one object to the other
    a.append(b)
    
    # check that the first object has changed correctly, but the other
    # remains unchanged
    self.assertEqual(a.get_summed_rate(), 1.5)
    self.assertEqual(b.get_summed_rate(), 1.0)
def test_analyse_sample_zero(self):
    ''' test we raise an error if the de novo count is zero '''
    
    rates = WeightedChoice()
    rates.add_choice(200, 1e-5, 'A', 'G')
    rates.add_choice(201, 2e-5, 'C', 'T')
    severity = [5, 10]
    
    with self.assertRaises(ValueError):
        analyse(rates, severity, 0, 0, iterations=10000)
def test_choice_with_alleles(self):
    """ test that choice_with_alleles() works correctly.
    """
    
    # if you add a choice with alleles, then check that we get back alleles,
    # and that they are the same
    choices = WeightedChoice()
    choices.add_choice(1, 1, "A", "T")
    self.assertEqual(choices.choice_with_alleles(),
        {'alt': 'T', 'ref': 'A', 'pos': 1, 'offset': 0})
    self.assertEqual(choices.choice(), 1)
    
    # if you add choices without alleles, then default the alleles to "N"
    choices = WeightedChoice()
    choices.add_choice(1, 1)
    self.assertEqual(choices.choice_with_alleles(),
        {'alt': 'N', 'ref': 'N', 'pos': 1, 'offset': 0})
    
    # make sure you can't add multi-base alleles to the choices. Check the
    # ref and alt alleles in separate blocks, so that the first raise
    # doesn't mask the second call.
    with self.assertRaises(TypeError):
        choices.add_choice(1, 1, "AA", "A")
    with self.assertRaises(TypeError):
        choices.add_choice(1, 1, "A", "AG")
    
    # make sure non-zero offsets are returned correctly
    choices = WeightedChoice()
    choices.add_choice(1, 1, "A", "T", 3)
    self.assertEqual(choices.choice_with_alleles(),
        {'alt': 'T', 'ref': 'A', 'pos': 1, 'offset': 3})
    self.assertEqual(choices.choice(), 1)
def test_analyse_mismatch(self):
    ''' test for error when the rates and severity lengths are different '''
    
    rates = WeightedChoice()
    rates.add_choice(200, 1e-5, 'A', 'G')
    rates.add_choice(201, 2e-5, 'C', 'T')
    severity = [5, 10, 5]
    
    with self.assertRaises(ValueError):
        analyse(rates, severity, 8, 1, iterations=100000)
def test___init__(self):
    """ check that __init__() initialises the object correctly """
    
    choices = WeightedChoice()
    
    # check that an object without any possible choices has a cumulative
    # sum of 0, but returns a choice of -1
    self.assertEqual(choices.get_summed_rate(), 0)
    self.assertEqual(choices.choice(), -1)
    
    # check that the type is set correctly
    self.assertEqual(type(choices), WeightedChoice)
def test_analyse_bigger(self):
    ''' test a more realistically sized data set '''
    
    seed(0)
    rates = WeightedChoice()
    pos = sorted(set([randint(1000, 3000) for x in range(2000)]))
    for x in pos:
        rates.add_choice(x, uniform(1e-10, 1e-7), 'A', 'G')
    severity = [randint(0, 40) for x in pos]
    
    p = analyse(rates, severity, 150, 4, iterations=10000)
    self.assertAlmostEqual(p, 3e-4, places=2)
def analyse_gene(ensembl, mut_dict, cadd, symbol, de_novos, constraint, weights):
    ''' analyse the severity of de novos found in a gene
    
    Args:
        ensembl: EnsemblRequest object, for transcript coordinates and sequence
        mut_dict: list of sequence-context mutation probabilities.
        cadd: pysam.TabixFile object for CADD scores (SNVs only)
        symbol: HGNC symbol for current gene
        de_novos: list of de novo mutations observed in current gene. Each
            entry is a dict with 'pos', 'ref', 'alt', and 'consequence' keys.
        constraint: regional constraint data, used to identify constrained
            positions within the gene.
        weights: dictionary of objects to weight CADD severity scores. We have
            different weights for protein-truncating and protein-altering
            variants, and within the protein-altering variants, different
            weights for variants in constrained and unconstrained regions.
    
    Returns:
        p-value for the observed total severity with respect to a null
        distribution of severities for the gene.
    '''
    
    sites = [x['pos'] for x in de_novos]
    try:
        # create gene/transcript for de novo mutations
        transcripts = load_gene(ensembl, symbol, sites)
    except IndexError:
        return 'NA'
    
    # get per site/allele mutation rates
    rates_by_cq = get_site_sampler(transcripts, mut_dict)
    chrom = transcripts[0].get_chrom()
    
    # get per site/allele severity scores, weighted by enrichment of missense
    # in known dominant at different severity thresholds
    constrained = get_constrained_positions(ensembl, constraint, symbol)
    severity = get_severity(cadd, chrom, rates_by_cq, weights, constrained)
    
    # convert the rates per site per consequence to rates per site
    rates = WeightedChoice()
    for cq in sorted(rates_by_cq):
        rates.append(rates_by_cq[cq])
    
    # get summed score for observed de novos
    observed = sum(get_severity(cadd, chrom, de_novos, weights, constrained))
    
    # simulate distribution of summed scores within transcript
    return analyse(rates, severity, observed, len(de_novos), 1000000)
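# Illustrative example of the de novo input expected by analyse_gene(). The
# positions, alleles and consequences below are invented; only the dict keys
# are taken from the function above.
EXAMPLE_DE_NOVOS = [
    {'pos': 1234567, 'ref': 'A', 'alt': 'G', 'consequence': 'missense_variant'},
    {'pos': 1234890, 'ref': 'C', 'alt': 'T', 'consequence': 'stop_gained'},
]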
def test_analyse(self):
    ''' test that we run the simulations correctly '''
    
    rates = WeightedChoice()
    rates.add_choice(200, 1e-5, 'A', 'G')
    rates.add_choice(201, 2e-5, 'C', 'T')
    rates.add_choice(202, 1e-5, 'C', 'G')
    severity = [5, 10, 5]
    
    # define a test where the observed score will fall at the midpoint of
    # the simulated null distribution
    p = analyse(rates, severity, 8, 1, iterations=100000)
    self.assertAlmostEqual(p, 0.5, places=2)
    
    # now check when we sample two de novo mutations
    p = analyse(rates, severity, 15, 2, iterations=100000)
    self.assertAlmostEqual(p, 0.25, places=2)
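# A minimal pure-Python sketch of the simulation that the analyse() tests
# above exercise, to illustrate the technique: draw `count` sites weighted
# by mutation rate, sum their severity scores, and report how often the
# simulated total exceeds the observed total. This is a sketch, not the real
# (cpp-backed) implementation; in particular the real analyse() appears to
# avoid returning a p-value of zero for extreme observations (see
# test_analyse_extreme_p_value below), which this sketch does not attempt.
import random

def simulated_p_value(probs, severity, observed, count, iterations):
    ''' sketch: p-value for an observed summed severity under a weighted null
    
    Assumes severity[i] is the score for the site with sampling weight
    probs[i], matching the insertion order used with add_choice() above.
    '''
    exceeded = 0
    for _ in range(iterations):
        # sample site indices with probability proportional to mutation rate
        idx = random.choices(range(len(probs)), weights=probs, k=count)
        if sum(severity[i] for i in idx) > observed:
            exceeded += 1
    return exceeded / iterations

# e.g. simulated_p_value([1e-5, 2e-5, 1e-5], [5, 10, 5], 8, 1, 100000)
# gives ~0.5, and with observed=15, count=2 gives ~0.25, matching the
# assertions in test_analyse() above.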
def get_site_sampler(transcripts, mut_dict):
    ''' get per position and alt allele mutation probability sampler.
    
    We need to be able to sample each site within a gene, where the
    probability of sampling a given site is equal to the sequence-context
    derived mutation probability. We use the denovonear.weights.WeightedChoice
    for this, which wraps around a cpp class for quick sampling. We use the
    SiteRates class to derive the per site/allele probabilities for different
    consequence categories. We combine the categories of interest into a
    single object, so we can sample across the full transcript at once. This
    also allows for multiple transcripts for a single gene, by taking the
    union of transcripts.
    
    Args:
        transcripts: list of Transcript objects for a gene.
        mut_dict: list of sequence-context mutation probabilities.
    
    Returns:
        dictionary of denovonear.WeightedChoice objects, indexed by
        consequence category, containing the mutation probabilities per
        position and alt allele.
    '''
    
    consequences = ['nonsense', 'missense', 'splice_lof']
    all_rates = {}
    for cq in consequences:
        all_rates[cq] = WeightedChoice()
    
    combined_tx = None
    for tx in transcripts:
        rates = SiteRates(tx, mut_dict, masked_sites=combined_tx,
            cds_coords=False)
        
        if combined_tx is None:
            combined_tx = tx
        else:
            combined_tx += tx
        
        for cq in consequences:
            all_rates[cq].append(rates[cq])
    
    return all_rates
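# A brief sketch of consuming the get_site_sampler() output: merge the
# per-consequence WeightedChoice objects into a single sampler (the same
# pattern analyse_gene() uses) and draw one simulated site/allele across the
# whole transcript. The function name is illustrative, not part of the module.
def sample_one_site(transcripts, mut_dict):
    ''' sketch: draw a single simulated site and alt allele for a gene '''
    rates_by_cq = get_site_sampler(transcripts, mut_dict)
    combined = WeightedChoice()
    for cq in sorted(rates_by_cq):
        combined.append(rates_by_cq[cq])
    # returns e.g. {'pos': ..., 'ref': ..., 'alt': ..., 'offset': ...}
    return combined.choice_with_alleles()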
def test_analyse_extreme_p_value(self):
    ''' test when the observed severity score exceeds all possible values '''
    
    rates = WeightedChoice()
    rates.add_choice(200, 1e-5, 'A', 'G')
    rates.add_choice(201, 2e-5, 'C', 'T')
    rates.add_choice(202, 1e-5, 'C', 'G')
    severity = [5, 10, 5]
    
    # now check when the observed severity score exceeds all possible values
    # from the severity distribution. This test gives an absurd p-value of
    # 1e-6, but that is because the observed value is unachievable given the
    # existing severity scores. In practice the observed score will always be
    # theoretically achievable in the null distribution, since the observed
    # score is calculated from the existing scores.
    p = analyse(rates, severity, 20, 1, iterations=100000)
    self.assertAlmostEqual(p, 1e-6, places=4)
def test_analyse_empty(self):
    ''' check we raise an error if the rates and severity are empty '''
    
    with self.assertRaises(ValueError):
        analyse(WeightedChoice(), [], 8, 1, iterations=10000)