Exemplo n.º 1
0
    def test_euro_problem(self):
        """
    test_euro_problem (irrealis_bayes.tests.FunctionalTestPMF)

    Euro-coin problem from Think Bayes (after MacKay): a Belgian one-euro
    coin, spun on edge 250 times, came up heads 140 times and tails 110 —
    suspicious enough that a statistician called it biased. We define 101
    hypotheses Hx (x = 0..100), where Hx says the probability of heads is
    x%, start from a uniform prior, and update once per observed spin to
    estimate the coin's heads probability.
    """
        # Likelihood of a single spin under hypothesis `given` (a percent):
        # heads with probability given/100, tails with the complement.
        class EuroProblem(PMF):
            def likelihood(self, data, given):
                heads_prob = given / 100.0
                if data == "H":
                    return heads_prob
                return 1 - heads_prob

        posterior = EuroProblem()
        posterior.uniform_dist(xrange(101))
        # 140 heads and 110 tails, applied one observation at a time.
        for outcome in "H" * 140 + "T" * 110:
            posterior.update(outcome)

        # The posterior mean lands just below 56% heads.
        self.assertTrue(55.95 < posterior.expectation() < 55.96)
        # 90% credible interval for the heads percentage.
        self.assertEqual((51, 61), CDF(posterior).percentiles(0.05, 0.95))
Exemplo n.º 2
0
    def test_euro_problem(self):
        '''
    test_euro_problem (irrealis_bayes.tests.FunctionalTestPMF)

    The euro problem from Think Bayes: when spun on edge 250 times, a
    Belgian one-euro coin came up heads 140 times and tails 110. To judge
    the coin, we estimate x, its probability of landing heads. We model
    101 hypotheses Hx -- "the probability of heads is x%" for x from 0 to
    100 -- with a uniform prior, then apply Bayes's theorem once per spin.
    '''
        class EuroProblem(PMF):
            def likelihood(self, data, given):
                # Chance of this spin's outcome if heads comes up given% of
                # the time.
                p_heads = given / 100.
                return p_heads if data == "H" else 1 - p_heads

        coin = EuroProblem()
        coin.uniform_dist(xrange(101))
        spins = 'H' * 140 + 'T' * 110
        for spin in spins:
            coin.update(spin)

        # Posterior expectation: a little under 56% heads.
        expectation = coin.expectation()
        self.assertTrue(55.95 < expectation < 55.96)
        # The 90% credible interval excludes 50%, hinting at bias.
        interval = CDF(coin).percentiles(0.05, 0.95)
        self.assertEqual((51, 61), interval)
Exemplo n.º 3
0
class TestCDF(unittest.TestCase):
    """Exercise CDF percentile lookups over a uniform five-outcome PMF."""

    def setUp(self):
        # Uniform distribution over the letters a-e (probability 0.2 each).
        self.pmf = PMF()
        self.pmf.uniform_dist("abcde")
        self.cdf = CDF(self.pmf)

    def test_percentile(self):
        # Each letter owns a 0.2-wide slice of cumulative probability, so
        # stepping through 0.0, 0.1, ..., 1.0 walks a-e in order.
        expected = ("a", "a", "a", "b", "b", "c", "c", "d", "d", "e", "e")
        for step, letter in enumerate(expected):
            self.assertEqual(letter, self.cdf.percentile(step / 10.0))

    def test_percentiles(self):
        # Both endpoints resolved in a single call.
        self.assertEqual(("b", "d"), self.cdf.percentiles(0.3, 0.8))
Exemplo n.º 4
0
class TestCDF(unittest.TestCase):
    def setUp(self):
        '''Build the fixture: a uniform PMF over five letters and its CDF.'''
        self.pmf = PMF()
        self.pmf.uniform_dist('abcde')
        self.cdf = CDF(self.pmf)

    def test_percentile(self):
        # (cumulative probability, expected letter) pairs covering 0..1;
        # every letter spans 0.2 of cumulative mass.
        cases = [
            (0.0, 'a'), (0.1, 'a'), (0.2, 'a'),
            (0.3, 'b'), (0.4, 'b'),
            (0.5, 'c'), (0.6, 'c'),
            (0.7, 'd'), (0.8, 'd'),
            (0.9, 'e'), (1.0, 'e'),
        ]
        for probability, letter in cases:
            self.assertEqual(letter, self.cdf.percentile(probability))

    def test_percentiles(self):
        lower, upper = self.cdf.percentiles(0.3, 0.8)
        self.assertEqual(('b', 'd'), (lower, upper))
Exemplo n.º 5
0
    def test_german_tank_problem(self):
        """
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes:

      During World War II, the Economic Warfare Division of the American
      Embassy in London used statistical analysis to estimate German production
      of tanks and other equipment.

      The Western Allies had captured log books, inventories, and repair records
      that included chassis and engine serial numbers for individual tanks.

      Analysis of these records indicated that serial numbers were allocated by
      manufacturer and tank type in blocks of 100 numbers, that numbers in each
      block were used sequentially, and that not all numbers in each block were
      used. So the problem of estimating German tank production could be
      reduced, within each block of 100 numbers, to a form of the locomotive
      problem.

      Based on this insight, American and British analysts produced estimates
      substantially lower than estimates from other forms of intelligence. And
      after the war, records indicated that they were substantially more
      accurate.

      They performed similar analyses for tires, trucks, rockets, and other
      equipment, yielding accurate and actionable economic intelligence.

      The German tank problem is historically interesting; it is also a nice
      example of real-world application of statistical estimation.

    Let's try a simplified version of this problem. Let's assume four
    producers A, B, C, D produced 10, 30, 100, 300 tanks respectively in a
    given time period, and that serial number blocks were allocated and used
    as follows:

      Producer    allocated     Used       Subtotal
      A           0-99          0-9        10

      B           100-199       100-129    30

      C           200-299       200-242    43
      C           300-399       300-356    57

      D           400-499       400-465    66
      D           500-599       500-583    84
      D           600-699       600-670    71
      D           700-799       700-778    79

    Now let's pretend we don't know how many tanks were made, nor which serial
    numbers were used, and then try to infer the total number of tanks on the
    basis of serial numbers observed.
    """
        # First we'll create a distribution for sampling. This distribution will be
        # uniform over the serial numbers actually used (per the table above).
        serial_number_blocks = (
            (0, 9),
            (100, 129),
            (200, 242),
            (300, 356),
            (400, 465),
            (500, 583),
            (600, 670),
            (700, 778),
        )
        # Make a list of all actual serial numbers (block endpoints inclusive).
        serial_numbers = sum((range(start, end + 1) for (start, end) in serial_number_blocks), [])
        sampling_dist = PMF()
        sampling_dist.uniform_dist(serial_numbers)

        # Pretending we don't know much, we'll assume a set of ten blocks of 100
        # serial numbers per block, treating each block as in the locomotive
        # problem. We'll use a modified power distribution that includes the
        # hypothesis that zero serial numbers were used in a given block.
        # NOTE(review): the zero-hypothesis hack below is commented out, so the
        # priors as written actually cover 1..99 used serials per block.
        class LocomotiveProblem(PMF):
            # Seeing serial `data` is impossible unless 0 <= data < given;
            # otherwise each of the `given` in-use serials is equally likely.
            def likelihood(self, data, given):
                return 1.0 / given if 0 <= data < given else 0

        pmfs = [LocomotiveProblem() for n in range(10)]
        for pmf in pmfs:
            pmf.power_law_dist(range(1, 100))
            # The following heavily biases prior distributions toward zero. Have to
            # renormalize after this hack.
            # pmf[0] = 100.; pmf.normalize()

        # Now let's make a bunch of observations, and update our pmfs accordingly.
        # Seeded so the sampled serials (and the printed figures) are repeatable.
        random.seed(0)
        for n in range(20):
            observation = sampling_dist.random()
            # High two digits select the per-block pmf; low two digits are the
            # within-block serial number used for the Bayesian update.
            pmf_number, pmf_partial_serial_number = divmod(observation, 100)
            pmf = pmfs[pmf_number]
            pmf.update(pmf_partial_serial_number)

        print
        # First thing we can try is summing expectations.
        print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)

        # Second thing we can try is summing endpoints of credible intervals. I
        # think that if I want a final 90% credible interval, I need my individual
        # credible intervals to have probability 0.9**(1./10.) (about 0.99 --
        # hence the 0.005/0.995 percentiles below).
        cdfs = [CDF(pmf) for pmf in pmfs]
        credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
        endpoint_arrays = zip(*credible_intervals)
        summed_credible_interval = [sum(array) for array in endpoint_arrays]
        print "90% summed_credible_interval:", summed_credible_interval

        # Third thing we can try is the distribution of sums of the ten blocks.
        sum_pmf = sum_independent_pmfs(pmfs)
        print "expectation of sum:", sum_pmf.expectation()
        sum_cdf = CDF(sum_pmf)
        credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
        print "90% credible interval of sum:", credible_interval_of_sum
        credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
        print "95% credible interval of sum:", credible_interval_of_sum
Exemplo n.º 6
0
    def test_locomotive_problem(self):
        """
    test_locomotive_problem (irrealis_bayes.tests.FunctionalTestPMF)

    Locomotive problem from Think Bayes: "A railroad numbers its
    locomotives in order 1..N. One day you see a locomotive with the
    number 60. Estimate how many locomotives the railroad has." The prior
    over N answers "what did we know before the data?"; the likelihood
    answers "how probable is seeing number 60 for a given N?". We try
    uniform priors with several upper bounds, then power-law priors, and
    watch how the posterior expectation behaves.
    """
        # Same likelihood as the dice problem: number `data` is impossible
        # when the fleet is smaller than `data`; otherwise each of the
        # `given` numbers is equally likely to be the one observed.
        class LocomotiveProblem(PMF):
            def likelihood(self, data, given):
                if given < data:
                    return 0
                return 1.0 / given

        pmf = LocomotiveProblem()

        # Uniform prior over 1..1000, updated with the single observation.
        pmf.uniform_dist(xrange(1, 1001))
        pmf.update(60)
        hypothesis, likelihood = max(pmf.iteritems(), key=lambda item: item[1])

        # 60 is the maximum-likelihood guess, however unlikely it feels to
        # have seen the highest-numbered locomotive.
        self.assertEqual(60, hypothesis)
        self.assertTrue(0.005 < likelihood < 0.006)

        # The posterior expectation is an alternative point estimate.
        self.assertTrue(333 < pmf.expectation() < 334)

        # But with one observation it is badly sensitive to the prior's
        # upper bound (500 vs 2000):
        pmf.uniform_dist(xrange(1, 501))
        pmf.update(60)
        self.assertTrue(207 < pmf.expectation() < 208)
        pmf.uniform_dist(xrange(1, 2001))
        pmf.update(60)
        self.assertTrue(552 < pmf.expectation() < 553)

        # More data (60, 30, 90) pulls the expectations toward agreement:
        for upper, low, high in ((501, 151, 152), (1001, 164, 165), (2001, 171, 172)):
            pmf.uniform_dist(xrange(1, upper))
            for serial in (60, 30, 90):
                pmf.update(serial)
            self.assertTrue(low < pmf.expectation() < high)

        # Alternatively, a better prior converges too. Axtell (Science,
        # http://www.sciencemag.org/content/293/5536/1818.full.pdf) reports
        # that company sizes follow a power law, so try power-law priors;
        # the expectations become nearly independent of the bound:
        for upper, low, high in ((501, 130, 131), (1001, 133, 134), (2001, 133, 134)):
            pmf.power_law_dist(xrange(1, upper))
            for serial in (60, 30, 90):
                pmf.update(serial)
            self.assertTrue(low < pmf.expectation() < high)

        # 90% credible interval from the last (power-law, 1..2000) posterior.
        self.assertEqual((91, 243), CDF(pmf).percentiles(0.05, 0.95))
Exemplo n.º 7
0
 def setUp(self):
     """Create the shared fixture: a uniform PMF over five letters and its CDF."""
     pmf = PMF()
     pmf.uniform_dist("abcde")
     self.pmf = pmf
     self.cdf = CDF(pmf)
Exemplo n.º 8
0
    def test_german_tank_problem(self):
        '''
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes:

      During World War II, the Economic Warfare Division of the American
      Embassy in London used statistical analysis to estimate German production
      of tanks and other equipment.

      The Western Allies had captured log books, inventories, and repair records
      that included chassis and engine serial numbers for individual tanks.

      Analysis of these records indicated that serial numbers were allocated by
      manufacturer and tank type in blocks of 100 numbers, that numbers in each
      block were used sequentially, and that not all numbers in each block were
      used. So the problem of estimating German tank production could be
      reduced, within each block of 100 numbers, to a form of the locomotive
      problem.

      Based on this insight, American and British analysts produced estimates
      substantially lower than estimates from other forms of intelligence. And
      after the war, records indicated that they were substantially more
      accurate.

      They performed similar analyses for tires, trucks, rockets, and other
      equipment, yielding accurate and actionable economic intelligence.

      The German tank problem is historically interesting; it is also a nice
      example of real-world application of statistical estimation.

    Let's try a simplified version of this problem. Let's assume four
    producers A, B, C, D produced 10, 30, 100, 300 tanks respectively in a
    given time period, and that serial number blocks were allocated and used
    as follows:

      Producer    allocated     Used       Subtotal
      A           0-99          0-9        10

      B           100-199       100-129    30

      C           200-299       200-242    43
      C           300-399       300-356    57

      D           400-499       400-465    66
      D           500-599       500-583    84
      D           600-699       600-670    71
      D           700-799       700-778    79

    Now let's pretend we don't know how many tanks were made, nor which serial
    numbers were used, and then try to infer the total number of tanks on the
    basis of serial numbers observed.
    '''
        # First we'll create a distribution for sampling. This distribution will be
        # uniform over the serial numbers actually used (the table above).
        serial_number_blocks = (
            (0, 9),
            (100, 129),
            (200, 242),
            (300, 356),
            (400, 465),
            (500, 583),
            (600, 670),
            (700, 778),
        )
        # Make a list of all actual serial numbers (endpoints inclusive).
        serial_numbers = sum((range(start, end + 1)
                              for (start, end) in serial_number_blocks), [])
        sampling_dist = PMF()
        sampling_dist.uniform_dist(serial_numbers)

        # Pretending we don't know much, we'll assume a set of ten blocks of 100
        # serial numbers per block, treating each block as in the locomotive
        # problem. We'll use a modified power distribution that includes the
        # hypothesis that zero serial numbers were used in a given block.
        # NOTE(review): the zero-hypothesis hack below is commented out, so
        # each prior actually covers 1..99 used serials.
        class LocomotiveProblem(PMF):
            # Zero likelihood unless 0 <= data < given; otherwise every one
            # of the `given` in-use serials is equally likely to be seen.
            def likelihood(self, data, given):
                return 1. / given if 0 <= data < given else 0

        pmfs = [LocomotiveProblem() for n in range(10)]
        for pmf in pmfs:
            pmf.power_law_dist(range(1, 100))
            # The following heavily biases prior distributions toward zero. Have to
            # renormalize after this hack.
            #pmf[0] = 100.; pmf.normalize()

        # Now let's make a bunch of observations, and update our pmfs accordingly.
        # The fixed seed makes the sampled serials (and output) repeatable.
        random.seed(0)
        for n in range(20):
            observation = sampling_dist.random()
            # High two digits select the block's pmf; low two digits are the
            # within-block serial used for the update.
            pmf_number, pmf_partial_serial_number = divmod(observation, 100)
            pmf = pmfs[pmf_number]
            pmf.update(pmf_partial_serial_number)

        print
        # First thing we can try is summing expectations.
        print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)

        # Second thing we can try is summing endpoints of credible intervals. I
        # think that if I want a final 90% credible interval, I need my individual
        # credible intervals to have probability 0.9**(1./10.) (roughly 0.99,
        # hence the 0.005/0.995 percentiles below).
        cdfs = [CDF(pmf) for pmf in pmfs]
        credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
        endpoint_arrays = zip(*credible_intervals)
        summed_credible_interval = [sum(array) for array in endpoint_arrays]
        print "90% summed_credible_interval:", summed_credible_interval

        # Third thing we can try is the distribution of sums over all blocks.
        sum_pmf = sum_independent_pmfs(pmfs)
        print "expectation of sum:", sum_pmf.expectation()
        sum_cdf = CDF(sum_pmf)
        credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
        print "90% credible interval of sum:", credible_interval_of_sum
        credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
        print "95% credible interval of sum:", credible_interval_of_sum
Exemplo n.º 9
0
    def test_locomotive_problem(self):
        '''
    test_locomotive_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes: a railroad numbers its locomotives 1..N and we
    observe locomotive number 60; estimate N. Two ingredients: the prior
    over N (what we believed before the data) and the likelihood of
    observing number 60 for each candidate N. The test compares uniform
    priors of several upper bounds against power-law priors.
    '''
        # Identical to the dice-problem likelihood: impossible to see a
        # number above the fleet size, uniform over 1..given otherwise.
        class LocomotiveProblem(PMF):
            def likelihood(self, data, given):
                return 1. / given if data <= given else 0

        posterior = LocomotiveProblem()

        # Uniform prior on 1..1000, one observation.
        posterior.uniform_dist(xrange(1, 1001))
        posterior.update(60)
        best = max(posterior.iteritems(), key=lambda kv: kv[1])

        # Maximizing the chance of being exactly right means guessing 60,
        # counter-intuitive as that seems.
        self.assertEqual(60, best[0])
        self.assertTrue(0.005 < best[1] < 0.006)

        # The posterior expectation is another point estimate.
        self.assertTrue(333 < posterior.expectation() < 334)

        def posterior_expectation(dist_method, upper, observations=(60,)):
            # Reset the prior via `dist_method`, apply the observations,
            # and report the resulting posterior expectation.
            dist_method(xrange(1, upper + 1))
            for number in observations:
                posterior.update(number)
            return posterior.expectation()

        # One observation: the expectation swings wildly with the uniform
        # prior's upper bound, which is bad.
        self.assertTrue(207 < posterior_expectation(posterior.uniform_dist, 500) < 208)
        self.assertTrue(552 < posterior_expectation(posterior.uniform_dist, 2000) < 553)

        # Three observations: the expectations begin to converge.
        sightings = (60, 30, 90)
        self.assertTrue(151 < posterior_expectation(posterior.uniform_dist, 500, sightings) < 152)
        self.assertTrue(164 < posterior_expectation(posterior.uniform_dist, 1000, sightings) < 165)
        self.assertTrue(171 < posterior_expectation(posterior.uniform_dist, 2000, sightings) < 172)

        # Power-law priors (motivated by Axtell's company-size report,
        # http://www.sciencemag.org/content/293/5536/1818.full.pdf) bring
        # the expectations into close agreement across bounds.
        self.assertTrue(130 < posterior_expectation(posterior.power_law_dist, 500, sightings) < 131)
        self.assertTrue(133 < posterior_expectation(posterior.power_law_dist, 1000, sightings) < 134)
        self.assertTrue(133 < posterior_expectation(posterior.power_law_dist, 2000, sightings) < 134)

        # 90% credible interval for N from the final power-law posterior.
        cdf = CDF(posterior)
        self.assertEqual((91, 243), cdf.percentiles(0.05, 0.95))
Exemplo n.º 10
0
 def setUp(self):
     # Fixture: a five-outcome uniform PMF and the CDF derived from it.
     source = PMF()
     source.uniform_dist('abcde')
     self.pmf = source
     self.cdf = CDF(source)