Example #1
    def testTTestEqualSamples(self):
        """Checks that t = 0 and p = 1 when the samples are the same."""
        t, _, p = ttest.WelchsTTest([1, 2, 3], [1, 2, 3])
        self.assertEqual(0, t)
        self.assertEqual(1, p)

        t, _, p = ttest.WelchsTTest([1, 2], [1, 2])
        self.assertEqual(0, t)
        self.assertEqual(1, p)
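Every one of these snippets calls ttest.WelchsTTest, which returns a
(t, degrees_of_freedom, p_value) tuple. The sketch below is not the actual
ttest module: it is a minimal illustration of what such a function might
compute, with the name welchs_ttest_sketch invented for this example and
scipy assumed only for the two-tailed p-value. It shows why identical samples
yield t = 0 and p = 1.

import math

import scipy.stats


def welchs_ttest_sketch(sample1, sample2):
    """Returns (t, df, p) for two independent samples with unequal variances."""
    n1, n2 = len(sample1), len(sample2)
    mean1 = sum(sample1) / float(n1)
    mean2 = sum(sample2) / float(n2)
    # Unbiased sample variance of each sample, divided by its sample size.
    v1 = sum((x - mean1) ** 2 for x in sample1) / (n1 - 1) / n1
    v2 = sum((x - mean2) ** 2 for x in sample2) / (n2 - 1) / n2
    if v1 + v2 == 0:
        # Both samples are constant; avoid dividing by zero. (A real
        # implementation would distinguish equal from unequal constants.)
        return 0.0, float(n1 + n2 - 2), 1.0
    t = (mean1 - mean2) / math.sqrt(v1 + v2)
    # Welch-Satterthwaite approximation of the degrees of freedom.
    df = (v1 + v2) ** 2 / (v1 ** 2 / (n1 - 1) + v2 ** 2 / (n2 - 1))
    # Two-tailed p-value: equal means give t = 0 and therefore p = 1.
    p = 2 * scipy.stats.t.sf(abs(t), df)
    return t, df, p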
Example #2
  def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is based on a statistical hypothesis test. The null
    hypothesis is that the two groups of results have no difference,
    i.e. there is no performance regression. The alternative hypothesis
    is that there is some difference between the groups that's unlikely
    to occur by chance.

    The score returned by this function represents our confidence in the
    alternative hypothesis.

    Note that if there's only one item in either sample, this means only
    one revision was classified good or bad, so there's not much evidence
    to make a decision.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, compute a value even if
          there is only one bad or good revision.

    Returns:
      A float between 0 and 100; 0 if the samples aren't large enough.
    """
    if ((len(sample1) <= 1 or len(sample2) <= 1) and
        not accept_single_bad_or_good):
      return 0.0
    if not sample1 or not sample2:
      return 0.0
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)
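A short usage sketch for this variant; the numbers are made up, and the exact
scores depend on the p-value that ttest.WelchsTTest returns. With
accept_single_bad_or_good=True the length check is skipped, so a score is
computed even when only one revision was classified good or bad.

# Clearly separated groups: p is small, so the score approaches 100.
score_high = ConfidenceScore([1, 2, 1, 2], [5, 6, 5, 6])

# Groups with identical means: t is 0 and p is 1, so the score is 0.
score_low = ConfidenceScore([2, 3, 2, 3], [3, 2, 3, 2])

# Only one "good" result: not enough evidence, so the default is 0.0.
score_zero = ConfidenceScore([2], [5, 6, 5, 6])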
Example #3
def ConfidenceScore(good_results_lists, bad_results_lists):
    """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.


  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
    # If there's only one item in either list, this means only one revision was
    # classified good or bad; this isn't good enough evidence to make a decision.
    # If an empty list was passed, that also implies zero confidence.
    if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
        return 0.0

    # Flatten the lists of results lists.
    sample1 = sum(good_results_lists, [])
    sample2 = sum(bad_results_lists, [])

    # If there were only empty lists in either of the lists (this is unexpected
    # and normally shouldn't happen), then we also want to return 0.
    if not sample1 or not sample2:
        return 0.0

    # The p-value is approximately the probability of obtaining the given set
    # of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)
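This variant takes lists of per-revision result lists. A hypothetical call
(the values are purely illustrative) showing how the inner lists are
flattened before the t-test:

# Each inner list holds the result values recorded for one revision.
good_results_lists = [[55.1, 55.3], [54.9, 55.0]]
bad_results_lists = [[60.2, 60.4], [59.8, 60.1]]

# sum(good_results_lists, []) flattens to [55.1, 55.3, 54.9, 55.0]; with
# groups this well separated the p-value is small and the score is high.
score = ConfidenceScore(good_results_lists, bad_results_lists)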
Example #4
    def testWelchsTTest(self):
        """Tests the t value and degrees of freedom output of Welch's t-test."""
        # The t-value can be checked with scipy.stats.ttest_ind(equal_var=False).
        t, df, _ = ttest.WelchsTTest([2, 3, 2, 3, 2, 3], [4, 5, 4, 5, 4, 5])
        self.assertAlmostEqual(10.0, df)

        # The t-value produced by scipy.stats.ttest_ind is -6.32455532034.
        # Our function may produce a slightly different result because of
        # accumulated rounding error, hence the loose delta below.
        self.assertAlmostEqual(-6.325, t, delta=1.0)
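The scipy comparison mentioned in the comment above could be reproduced as
follows (assuming scipy is installed; the expected t-value is the one quoted
in the test):

import scipy.stats

# equal_var=False selects Welch's t-test rather than the pooled-variance test.
t_scipy, p_scipy = scipy.stats.ttest_ind(
    [2, 3, 2, 3, 2, 3], [4, 5, 4, 5, 4, 5], equal_var=False)
# t_scipy is approximately -6.32455532034, matching the comment in the test.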
Example #5
    def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
        """Calculates a confidence score.

        This score is a percentage which represents our degree of confidence
        in the proposition that the good results and bad results are distinct
        groups, and their differences aren't due to chance alone.

        Args:
          sample1: A flat list of "good" result numbers.
          sample2: A flat list of "bad" result numbers.
          accept_single_bad_or_good: If True, computes confidence even if
              there is only one bad or good revision; otherwise a single good
              or bad revision always yields a confidence of 0.0. This flag
              will probably go away once we implement expanding the bisect
              range by one more revision for such cases.

        Returns:
          A number in the range [0, 100].
        """
        # If there's only one item in either list, this means only one revision was
        # classified good or bad; this isn't good enough evidence to make a
        # decision. If an empty list was passed, that also implies zero confidence.
        if not accept_single_bad_or_good:
            if len(sample1) <= 1 or len(sample2) <= 1:
                return 0.0

        # If there were only empty lists in either of the lists (this is unexpected
        # and normally shouldn't happen), then we also want to return 0.
        if not sample1 or not sample2:
            return 0.0

        # The p-value is approximately the probability of obtaining the given set
        # of good and bad values just by chance.
        _, _, p_value = ttest.WelchsTTest(sample1, sample2)
        return 100.0 * (1.0 - p_value)
Example #6
    def testTTestMeanDifference(self):
        """Verifies that smaller difference between means -> higher p value."""
        _, _, p_far_means = ttest.WelchsTTest([2, 3, 2, 3], [5, 6, 5, 6])
        _, _, p_near_means = ttest.WelchsTTest([2, 3, 2, 3], [3, 4, 3, 4])
        self.assertLess(p_far_means, p_near_means)
Example #7
    def testTTestSampleSize(self):
        """Verifies that smaller sample size -> higher p value."""
        _, _, p_larger_sample = ttest.WelchsTTest([2, 3, 2, 3], [4, 5, 4, 5])
        _, _, p_smaller_sample = ttest.WelchsTTest([2, 3, 2, 3], [4, 5])
        self.assertLess(p_larger_sample, p_smaller_sample)
Example #8
    def testTTestVariance(self):
        """Verifies that higher variance -> higher p value."""
        _, _, p_low_var = ttest.WelchsTTest([2, 3, 2, 3], [4, 5, 4, 5])
        _, _, p_high_var = ttest.WelchsTTest([1, 4, 1, 4], [3, 6, 3, 6])
        self.assertLess(p_low_var, p_high_var)
Example #9
    def testTTestVeryDifferentSamples(self):
        """Checks that p is very low when the samples are clearly different."""
        t, _, p = ttest.WelchsTTest([100, 101, 100, 101, 100],
                                    [1, 2, 1, 2, 1, 2, 1, 2])
        self.assertGreaterEqual(t, 250)
        self.assertLessEqual(p, 0.01)