def test_iadd_pmfs(self):
    """In-place addition of two fair coin-flip PMFs yields the triangle distribution."""
    first, second = PMF(), PMF()
    first.uniform_dist((0, 1))
    second.uniform_dist((0, 1))
    first += second
    # Sum of two independent fair 0/1 draws: P(0)=P(2)=0.25, P(1)=0.5.
    for outcome, probability in ((0, 0.25), (1, 0.5), (2, 0.25)):
        self.assertTrue(probability - 0.001 < first[outcome] < probability + 0.001)
def test_filter_possible_events(self):
    """Events with zero probability are dropped by filter_possible_events."""
    pmf = PMF()
    pmf.uniform_dist("abcdef")
    pmf["f"] = 0
    filtered_pmf = filter_possible_events(pmf)
    # All events with nonzero probability must survive the filter.
    for x in "abcde":
        self.assertTrue(x in filtered_pmf)
    # Bug fix: the original repeated the membership assertion for the last
    # loop variable; the second check should verify that the
    # zero-probability event was actually removed.
    self.assertFalse("f" in filtered_pmf)
def test_add_two_independent_pmfs(self):
    """add_two_independent_pmfs convolves two fair coin-flip PMFs correctly."""
    pmf_a, pmf_b = PMF(), PMF()
    pmf_a.uniform_dist((0, 1))
    pmf_b.uniform_dist((0, 1))
    total = add_two_independent_pmfs(pmf_a, pmf_b)
    # Convolution of two uniform {0, 1} PMFs: 0.25, 0.5, 0.25.
    for outcome, probability in ((0, 0.25), (1, 0.5), (2, 0.25)):
        self.assertTrue(probability - 0.001 < total[outcome] < probability + 0.001)
def test_filter_possible_events(self):
    '''Events with zero probability are dropped by filter_possible_events.'''
    pmf = PMF()
    pmf.uniform_dist('abcdef')
    pmf['f'] = 0
    filtered_pmf = filter_possible_events(pmf)
    # All events with nonzero probability must survive the filter.
    for x in 'abcde':
        self.assertTrue(x in filtered_pmf)
    # Bug fix: the original repeated the membership assertion for the last
    # loop variable; the second check should verify that the
    # zero-probability event was actually removed.
    self.assertFalse('f' in filtered_pmf)
class TestCDF(unittest.TestCase):
    """Exercise CDF percentile lookups over a uniform five-event PMF."""

    def setUp(self):
        self.pmf = PMF()
        self.pmf.uniform_dist("abcde")
        self.cdf = CDF(self.pmf)

    def test_percentile(self):
        # Each of the five events covers a 0.2-wide probability band;
        # probabilities at or below a band's upper edge map to that event.
        expectations = (
            ("a", (0.0, 0.1, 0.2)),
            ("b", (0.3, 0.4)),
            ("c", (0.5, 0.6)),
            ("d", (0.7, 0.8)),
            ("e", (0.9, 1.0)),
        )
        for event, probabilities in expectations:
            for probability in probabilities:
                self.assertEqual(event, self.cdf.percentile(probability))

    def test_percentiles(self):
        # percentiles() returns the pair of events at the two probabilities.
        self.assertEqual(("b", "d"), self.cdf.percentiles(0.3, 0.8))
class TestCDF(unittest.TestCase):
    '''Exercise CDF percentile lookups over a uniform five-event PMF.'''

    def setUp(self):
        self.pmf = PMF()
        self.pmf.uniform_dist('abcde')
        self.cdf = CDF(self.pmf)

    def test_percentile(self):
        # Each of the five events covers a 0.2-wide probability band;
        # probabilities at or below a band's upper edge map to that event.
        expectations = (
            ('a', (0.0, 0.1, 0.2)),
            ('b', (0.3, 0.4)),
            ('c', (0.5, 0.6)),
            ('d', (0.7, 0.8)),
            ('e', (0.9, 1.0)),
        )
        for event, probabilities in expectations:
            for probability in probabilities:
                self.assertEqual(event, self.cdf.percentile(probability))

    def test_percentiles(self):
        # percentiles() returns the pair of events at the two probabilities.
        self.assertEqual(('b', 'd'), self.cdf.percentiles(0.3, 0.8))
def test_german_tank_problem(self):
    """
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes: During World War II, the Economic Warfare Division of
    the American Embassy in London used statistical analysis to estimate
    German production of tanks and other equipment. The Western Allies had
    captured log books, inventories, and repair records that included
    chassis and engine serial numbers for individual tanks. Analysis of
    these records indicated that serial numbers were allocated by
    manufacturer and tank type in blocks of 100 numbers, that numbers in
    each block were used sequentially, and that not all numbers in each
    block were used. So the problem of estimating German tank production
    could be reduced, within each block of 100 numbers, to a form of the
    locomotive problem. Based on this insight, American and British
    analysts produced estimates substantially lower than estimates from
    other forms of intelligence. And after the war, records indicated that
    they were substantially more accurate.

    They performed similar analyses for tires, trucks, rockets, and other
    equipment, yielding accurate and actionable economic intelligence.

    The German tank problem is historically interesting; it is also a nice
    example of a real-world application of statistical estimation. Let's
    try a simplified version of this problem. Let's assume four producers
    A, B, C, D produced 10, 30, 100, 300 tanks each in a given time
    period, and that serial number blocks were allocated and used as
    follows:

        Producer    Allocated   Used      Subtotal
        A           0-99        0-9       10
        B           100-199     100-129   30
        C           200-299     200-242   43
        C           300-399     300-356   57
        D           400-499     400-465   66
        D           500-599     500-583   84
        D           600-699     600-670   71
        D           700-799     700-778   79

    Now let's pretend we don't know how many tanks were made, nor which
    serial numbers were used, and then try to infer the total number of
    tanks on the basis of serial numbers observed.
    """
    # First we'll create a distribution for sampling. This distribution
    # will be uniform over the serial numbers actually used.
    serial_number_blocks = (
        (0, 9), (100, 129), (200, 242), (300, 356),
        (400, 465), (500, 583), (600, 670), (700, 778),
    )
    # Make a list of all actual serial numbers (flatten the per-block
    # ranges into one list; Python 2 range() returns lists).
    serial_numbers = sum((range(start, end + 1) for (start, end) in serial_number_blocks), [])
    sampling_dist = PMF()
    sampling_dist.uniform_dist(serial_numbers)
    # Pretending we don't know much, we'll assume a set of ten blocks of
    # 100 serial numbers per block, treating each block as in the
    # locomotive problem. We'll use a modified power distribution that
    # includes the hypothesis that zero serial numbers were used in a
    # given block.
    class LocomotiveProblem(PMF):
        # Likelihood of seeing serial number `data` if `given` numbers
        # were used in this block: uniform over 0..given-1, else 0.
        def likelihood(self, data, given):
            return 1.0 / given if 0 <= data < given else 0
    pmfs = [LocomotiveProblem() for n in range(10)]
    for pmf in pmfs:
        pmf.power_law_dist(range(1, 100))
        # The following heavily biases prior distributions toward zero.
        # Have to renormalize after this hack.
        # pmf[0] = 100.; pmf.normalize()
    # Now let's make a bunch of observations, and update our pmfs
    # accordingly. Fixed seed keeps the sampled observations (and thus the
    # printed estimates) reproducible.
    random.seed(0)
    for n in range(20):
        observation = sampling_dist.random()
        # High digits select the block's PMF; low digits are the
        # within-block serial number used as the Bayesian update datum.
        pmf_number, pmf_partial_serial_number = divmod(observation, 100)
        pmf = pmfs[pmf_number]
        pmf.update(pmf_partial_serial_number)
    print
    # First thing we can try is summing expectations.
    print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)
    # Second thing we can try is summing endpoints of credible intervals.
    # I think that if I want a final 90% credible interval, I need my
    # individual credible intervals to have probability 0.9**(1./10.).
    cdfs = [CDF(pmf) for pmf in pmfs]
    credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
    endpoint_arrays = zip(*credible_intervals)
    summed_credible_interval = [sum(array) for array in endpoint_arrays]
    print "90% summed_credible_interval:", summed_credible_interval
    # Third thing we can try is the distribution of sums.
    sum_pmf = sum_independent_pmfs(pmfs)
    print "expectation of sum:", sum_pmf.expectation()
    sum_cdf = CDF(sum_pmf)
    credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
    print "90% credible interval of sum:", credible_interval_of_sum
    credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
    print "95% credible interval of sum:", credible_interval_of_sum
def test_german_tank_problem(self):
    '''
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes: During World War II, the Economic Warfare Division of
    the American Embassy in London used statistical analysis to estimate
    German production of tanks and other equipment. The Western Allies had
    captured log books, inventories, and repair records that included
    chassis and engine serial numbers for individual tanks. Analysis of
    these records indicated that serial numbers were allocated by
    manufacturer and tank type in blocks of 100 numbers, that numbers in
    each block were used sequentially, and that not all numbers in each
    block were used. So the problem of estimating German tank production
    could be reduced, within each block of 100 numbers, to a form of the
    locomotive problem. Based on this insight, American and British
    analysts produced estimates substantially lower than estimates from
    other forms of intelligence. And after the war, records indicated that
    they were substantially more accurate.

    They performed similar analyses for tires, trucks, rockets, and other
    equipment, yielding accurate and actionable economic intelligence.

    The German tank problem is historically interesting; it is also a nice
    example of a real-world application of statistical estimation. Let's
    try a simplified version of this problem. Let's assume four producers
    A, B, C, D produced 10, 30, 100, 300 tanks each in a given time
    period, and that serial number blocks were allocated and used as
    follows:

        Producer    Allocated   Used      Subtotal
        A           0-99        0-9       10
        B           100-199     100-129   30
        C           200-299     200-242   43
        C           300-399     300-356   57
        D           400-499     400-465   66
        D           500-599     500-583   84
        D           600-699     600-670   71
        D           700-799     700-778   79

    Now let's pretend we don't know how many tanks were made, nor which
    serial numbers were used, and then try to infer the total number of
    tanks on the basis of serial numbers observed.
    '''
    # First we'll create a distribution for sampling. This distribution
    # will be uniform over the serial numbers actually used.
    serial_number_blocks = (
        (0, 9), (100, 129), (200, 242), (300, 356),
        (400, 465), (500, 583), (600, 670), (700, 778),
    )
    # Make a list of all actual serial numbers (flatten the per-block
    # ranges into one list; Python 2 range() returns lists).
    serial_numbers = sum((range(start, end + 1) for (start, end) in serial_number_blocks), [])
    sampling_dist = PMF()
    sampling_dist.uniform_dist(serial_numbers)
    # Pretending we don't know much, we'll assume a set of ten blocks of
    # 100 serial numbers per block, treating each block as in the
    # locomotive problem. We'll use a modified power distribution that
    # includes the hypothesis that zero serial numbers were used in a
    # given block.
    class LocomotiveProblem(PMF):
        # Likelihood of seeing serial number `data` if `given` numbers
        # were used in this block: uniform over 0..given-1, else 0.
        def likelihood(self, data, given):
            return 1. / given if 0 <= data < given else 0
    pmfs = [LocomotiveProblem() for n in range(10)]
    for pmf in pmfs:
        pmf.power_law_dist(range(1, 100))
        # The following heavily biases prior distributions toward zero.
        # Have to renormalize after this hack.
        #pmf[0] = 100.; pmf.normalize()
    # Now let's make a bunch of observations, and update our pmfs
    # accordingly. Fixed seed keeps the sampled observations (and thus the
    # printed estimates) reproducible.
    random.seed(0)
    for n in range(20):
        observation = sampling_dist.random()
        # High digits select the block's PMF; low digits are the
        # within-block serial number used as the Bayesian update datum.
        pmf_number, pmf_partial_serial_number = divmod(observation, 100)
        pmf = pmfs[pmf_number]
        pmf.update(pmf_partial_serial_number)
    print
    # First thing we can try is summing expectations.
    print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)
    # Second thing we can try is summing endpoints of credible intervals.
    # I think that if I want a final 90% credible interval, I need my
    # individual credible intervals to have probability 0.9**(1./10.).
    cdfs = [CDF(pmf) for pmf in pmfs]
    credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
    endpoint_arrays = zip(*credible_intervals)
    summed_credible_interval = [sum(array) for array in endpoint_arrays]
    print "90% summed_credible_interval:", summed_credible_interval
    # Third thing we can try is the distribution of sums.
    sum_pmf = sum_independent_pmfs(pmfs)
    print "expectation of sum:", sum_pmf.expectation()
    sum_cdf = CDF(sum_pmf)
    credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
    print "90% credible interval of sum:", credible_interval_of_sum
    credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
    print "95% credible interval of sum:", credible_interval_of_sum