Example #1
    def test_sum_three_pmfs(self):
        pmfs = [PMF.fromkeys((0, 1), 0.5) for n in range(3)]
        sum_pmf = sum_independent_pmfs(pmfs)
        self.assertTrue(0.124 < sum_pmf[0] < 0.126)
        self.assertTrue(0.374 < sum_pmf[1] < 0.376)
        self.assertTrue(0.374 < sum_pmf[2] < 0.376)
        self.assertTrue(0.124 < sum_pmf[3] < 0.126)
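
The expected values here are just binomial: summing three independent fair 0/1 PMFs gives probabilities 1/8, 3/8, 3/8, 1/8. This page doesn't show sum_independent_pmfs itself; a minimal sketch consistent with these tests, assuming PMF behaves like a dict mapping each value to its probability (an assumption, not the library's actual code), is:

    def sum_independent_pmfs(pmfs):
        # Convolve the PMFs one at a time, starting from the certain sum 0.
        total = {0: 1.0}
        for pmf in pmfs:
            convolved = {}
            for value, p in total.items():
                for added, q in pmf.items():
                    # P(sum == value + added) accumulates p*q over all pairs.
                    key = value + added
                    convolved[key] = convolved.get(key, 0.0) + p * q
            total = convolved
        return total

The real function presumably returns a PMF rather than a plain dict, but convolution is the core idea: probabilities multiply and values add.
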
Example #2
    def test_german_tank_problem(self):
        """
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes:

      During World War II, the Economic Warfare Division of the American
      Embassy in London used statistical analysis to estimate German production
      of tanks and other equipment.

      The Western Allies had captured log books, inventories, and repair records
      that included chassis and engine serial numbers for individual tanks.

      Analysis of these records indicated that serial numbers were allocated by
      manufacturer and tank type in blocks of 100 numbers, that numbers in each
      block were used sequentially, and that not all numbers in each block were
      used. So the problem of estimating German tank production could be
      reduced, within each block of 100 numbers, to a form of the locomotive
      problem.

      Based on this insight, American and British analysts produced estimates
      substantially lower than estimates from other forms of intelligence. And
      after the war, records indicated that they were substantially more
      accurate.

      They performed similar analyses for tires, trucks, rockets, and other
      equipment, yielding accurate and actionable economic intelligence.

      The German tank problem is historically interesting; it is also a nice
      example of a real-world application of statistical estimation.

    Let's try a simplified version of this problem. Let's assume four
    producers A, B, C, and D produced 10, 30, 100, and 300 tanks respectively
    in a given time period, and that serial number blocks were allocated and
    used as follows:
    
      Producer    Allocated     Used       Subtotal
      A           0-99          0-9        10

      B           100-199       100-129    30

      C           200-299       200-242    43
      C           300-399       300-356    57

      D           400-499       400-465    66
      D           500-599       500-583    84
      D           600-699       600-670    71
      D           700-799       700-778    79

    Now let's pretend we don't know how many tanks were made, nor which serial
    numbers were used, and then try to infer the total number of tanks from
    the serial numbers observed.
    """
        # First we'll create a distribution for sampling. This distribution will be
        # uniform over the serial numbers used.
        serial_number_blocks = (
            (0, 9),
            (100, 129),
            (200, 242),
            (300, 356),
            (400, 465),
            (500, 583),
            (600, 670),
            (700, 778),
        )
        # Make a list of all actual serial numbers.
        serial_numbers = sum((range(start, end + 1)
                              for (start, end) in serial_number_blocks), [])
        sampling_dist = PMF()
        sampling_dist.uniform_dist(serial_numbers)

        # Pretending we don't know much, we'll assume ten blocks of 100 serial
        # numbers per block, treating each block as in the locomotive problem.
        # We'll use a power-law prior over how many serials were used per
        # block (a variant that adds the hypothesis that zero serial numbers
        # were used in a given block is commented out below).
        class LocomotiveProblem(PMF):
            def likelihood(self, data, given):
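                # Likelihood of seeing serial `data` if `given` serials were
                # used in this block: uniform over 0..given-1, zero otherwise.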
                return 1.0 / given if 0 <= data < given else 0

        pmfs = [LocomotiveProblem() for n in range(10)]
        for pmf in pmfs:
            pmf.power_law_dist(range(1, 100))
            # The following heavily biases prior distributions toward zero. Have to
            # renormalize after this hack.
            # pmf[0] = 100.; pmf.normalize()

        # Now let's make a bunch of observations, and update our pmfs accordingly.
        random.seed(0)
        for n in range(20):
            observation = sampling_dist.random()
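            # divmod splits a serial into (block index, serial within block),
            # e.g. 583 -> block 5, within-block serial 83.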
            pmf_number, pmf_partial_serial_number = divmod(observation, 100)
            pmf = pmfs[pmf_number]
            pmf.update(pmf_partial_serial_number)

        print
        # First thing we can try is summing expectations.
        print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)

        # Second thing we can try is summing endpoints of credible intervals. I
        # think that if I want a final 90% credible interval, I need my individual
        # credible intervals to have probability 0.9**(1./10.).
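        # (0.9**(1./10.) is about 0.9895, so the 99% intervals used below,
        # percentiles(0.005, 0.995), are a close stand-in: 0.99**10 ~ 0.904.)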
        cdfs = [CDF(pmf) for pmf in pmfs]
        credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
        endpoint_arrays = zip(*credible_intervals)
        summed_credible_interval = [sum(array) for array in endpoint_arrays]
        print "90% summed_credible_interval:", summed_credible_interval

        # Third thing we can try is the distribution of the sum.
        sum_pmf = sum_independent_pmfs(pmfs)
        print "expectation of sum:", sum_pmf.expectation()
        sum_cdf = CDF(sum_pmf)
        credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
        print "90% credible interval of sum:", credible_interval_of_sum
        credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
        print "95% credible interval of sum:", credible_interval_of_sum
Example #3
    def test_sum_two_pmfs(self):
        pmfs = [PMF.fromkeys((0, 1), 0.5) for n in range(2)]
        sum_pmf = sum_independent_pmfs(pmfs)
        self.assertTrue(0.249 < sum_pmf[0] < 0.251)
        self.assertTrue(0.499 < sum_pmf[1] < 0.501)
        self.assertTrue(0.249 < sum_pmf[2] < 0.251)
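
Again the asserted bounds bracket the exact convolution values for two fair coins: P(0) = 0.5 * 0.5 = 0.25, P(1) = 0.5 * 0.5 + 0.5 * 0.5 = 0.5, and P(2) = 0.25. The open intervals presumably just allow for floating-point rounding.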