예제 #1
0
 def testBasic(self):
     """Check KolmogorovSmirnov p-values for two pairs of integer ranges.

     The first pair shares no values; in the second pair one range is a
     prefix of the other. Results are compared with assertAlmostEqual, so
     they only need to agree with the expected value after rounding.
     """
     cases = (
         (range(10), range(20, 30), 1.8879793657162556e-05),
         (range(5), range(10), 0.26680230985258474),
     )
     for sample_a, sample_b, expected_p_value in cases:
         actual_p_value = kolmogorov_smirnov.KolmogorovSmirnov(
             sample_a, sample_b)
         self.assertAlmostEqual(actual_p_value, expected_p_value)
예제 #2
0
def _CompareValues(values_a, values_b):
  """Decide whether two samples are the same, different, or unknown.

  The smaller of the Kolmogorov-Smirnov and Mann-Whitney U p-values is
  compared against two thresholds: _SIGNIFICANCE_LEVEL, and a looser
  "questionable" level looked up by the size of the smaller sample.

  Arguments:
    values_a: A list of sortable values. They don't need to be numeric.
    values_b: A list of sortable values. They don't need to be numeric.

  Returns:
    _DIFFERENT: The samples likely come from different distributions.
        Reject the null hypothesis.
    _SAME: Not enough evidence to say that the samples come from different
        distributions. Fail to reject the null hypothesis.
    _UNKNOWN: Not enough evidence to say that the samples come from different
        distributions, but it looks a little suspicious, and we would like more
        data before making a final decision. Also returned when either sample
        is empty.
  """
  if not (values_a and values_b):
    # A sample has no values in it, so there is nothing to compare yet.
    return _UNKNOWN

  # MWU is bad at detecting changes in variance, and K-S is bad with discrete
  # distributions. So use both. We want low p-values for the below examples.
  #        a                     b               MWU(a, b)  KS(a, b)
  # [0]*20            [0]*15+[1]*5                0.0097     0.4973
  # range(10, 30)     range(10)+range(30, 40)     0.4946     0.0082
  p_value = min(
      kolmogorov_smirnov.KolmogorovSmirnov(values_a, values_b),
      mann_whitney_u.MannWhitneyU(values_a, values_b))

  if p_value < _SIGNIFICANCE_LEVEL:
    # The p-value is less than the significance level. Reject the null
    # hypothesis.
    return _DIFFERENT

  # Select the questionable threshold by the size of the smaller sample: one
  # table entry per ten values, clamped to the last entry. Floor division is
  # required here; true division produces a float, which is not a valid list
  # index.
  index = min(len(values_a), len(values_b)) // 10
  index = min(index, len(_QUESTIONABLE_SIGNIFICANCE_LEVELS) - 1)
  questionable_significance_level = _QUESTIONABLE_SIGNIFICANCE_LEVELS[index]
  if p_value < questionable_significance_level:
    # The p-value is not less than the significance level, but it's small enough
    # to be suspicious. We'd like to investigate more closely.
    return _UNKNOWN

  # The p-value is quite large. We're not suspicious that the two samples might
  # come from different distributions, and we don't care to investigate more.
  return _SAME
예제 #3
0
 def testAllValuesIdentical(self):
     """Two samples made of the same repeated value give exactly 1.0."""
     sample_a = [0] * 5
     sample_b = [0] * 5
     p_value = kolmogorov_smirnov.KolmogorovSmirnov(sample_a, sample_b)
     self.assertEqual(p_value, 1.0)
예제 #4
0
 def testSmallSamples(self):
     """Check the p-value for two single-element samples.

     The expected value is a computed float, so it is compared with
     assertAlmostEqual rather than exact equality; a harmless last-digit
     rounding difference should not fail the test.
     """
     self.assertAlmostEqual(
         kolmogorov_smirnov.KolmogorovSmirnov([0], [1]),
         0.2890414283708268)
예제 #5
0
 def testDuplicateValues(self):
     """Check the p-value when each sample is one value repeated.

     Every element of the first sample is 0 and every element of the
     second is 1, so each sample consists entirely of duplicates.
     """
     all_zeros = [0] * 5
     all_ones = [1] * 5
     p_value = kolmogorov_smirnov.KolmogorovSmirnov(all_zeros, all_ones)
     self.assertAlmostEqual(p_value, 0.0037813540593701006)