def test_err_perfect_first_trumps_many_good():
    """Tests that a perfect document at rank 1 trumps getting later ranks right.

    The authors of [1] list this as a motivating example. A ranking that
    puts a "perfect" document at rank 1 (i.e. one that is almost certain
    to satisfy the user's need) should trump one that puts a merely
    "good" one at rank 1, regardless of the documents at later ranks.
    The reasoning is that later ranks won't need to be examined when the
    first document is already sufficient.

    References
    ----------
    [1] Chapelle, Olivier, et al. "Expected Reciprocal Rank for Graded
        Relevance." Proceedings of the 18th ACM Conference on Information
        and Knowledge Management. ACM, 2009.
        http://olivier.chapelle.cc/pub/err.pdf
    """
    y_true = ranking_ordering_conversion([range(20)])
    # Gets the "perfect" document right, everything else wrong.
    perfect_first = ranking_ordering_conversion([
        [0, 19, 18, 17, 16, 15, 14, 13, 12, 11,
         10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    ])
    # Does pretty well for most documents, but ranks the "perfect" one last.
    all_good = ranking_ordering_conversion([
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
         11, 12, 13, 14, 15, 16, 17, 18, 19, 0]
    ])
    assert K.eval(err(y_true, perfect_first)) > K.eval(err(y_true, all_good))
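# For intuition, the numbers behind this example are easy to eyeball.
# A hedged, illustrative sketch (not exercised by the test suite; the
# helper name is hypothetical), assuming the (2**g - 1) / 2**g_max
# satisfaction-probability mapping with g_max = 19 for 20 graded documents:
def _err_motivating_example_numbers():
    p_perfect = (2**19 - 1) / 2**19  # ~0.999998: the user almost surely stops at rank 1
    p_good = (2**18 - 1) / 2**19     # ~0.5: half the users read past rank 1
    # perfect_first banks nearly the full utility of 1 at rank 1, so its
    # ERR is ~1 regardless of later ranks; all_good starts from ~0.5, and
    # every later term is discounted by both 1/(r + 1) and the shrinking
    # probability of reaching rank r at all.
    return p_perfect, p_good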
def test_err_against_manually_verified_example():
    """Compares the implementation against a manual calculation."""
    y_true = ranking_ordering_conversion([[1, 2, 0]])
    y_pred = ranking_ordering_conversion([[2, 1, 0]])
    # Per-object probabilities (in object order 0, 1, 2) that each
    # document satisfies the user's need, under the relevance gain
    # mapping (2**grade - 1) / 2**max_grade:
    #     [2**1 - 1, 2**2 - 1, 2**0 - 1] / 2**2 = [1/4, 3/4, 0]
    # Multiplied by each object's predicted-rank utility 1/(r + 1):
    #     [(1/4)/3, (3/4)/2, 0/1] = [1/12, 3/8, 0]
    # These per-object products still lack the cascade factors, so the
    # full ERR is derived by walking down the predicted ranking:
    # We ranked object 2 first. It has a true rank of 1 and therefore
    # (with the relevance gain probability mapping) a probability of
    #     (2**(2 - 1) - 1) / 2**2 = 1/4
    # of matching the user's need. It sits at rank 0, which has utility
    #     1/(0 + 1) = 1.
    # Object 1 is next. True rank of 0, probability
    #     (2**(2 - 0) - 1) / 2**2 = 3/4
    # and utility
    #     1/(1 + 1) = 1/2.
    # Object 0 is last. True rank of 2, probability
    #     (2**(2 - 2) - 1) / 2**2 = 0
    # and utility
    #     1/(2 + 1) = 1/3.
    # Each term is weighted by the probability that no earlier document
    # already satisfied the user, giving the expected utility:
    #     1/4 * 1 + (1 - 1/4) * 3/4 * 1/2 + (1 - 1/4) * (1 - 3/4) * 0 * 1/3
    #     = 17/32
    # approx() because comparing floats exactly is inherently error-prone.
    assert K.eval(err(y_true, y_pred)) == approx(17 / 32)
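# A minimal NumPy sketch of ERR under the cascade model that makes the
# arithmetic above executable. Hedged assumptions: the same
# (2**g - 1) / 2**g_max probability mapping, per-object rank vectors as
# inputs (hand-converted from the orderings [1, 2, 0] and [2, 1, 0], so
# it does not depend on ranking_ordering_conversion), and the np/approx
# names this file already imports. err_reference and the extra test are
# hypothetical, not part of the implementation under test.
def err_reference(true_ranks, pred_ranks):
    """Plain-Python ERR for a single query; an illustrative sketch."""
    n = len(true_ranks)
    # Rank 0 carries the highest grade: grade = (n - 1) - true_rank.
    grades = (n - 1) - np.asarray(true_ranks)
    sat_prob = (2.0 ** grades - 1.0) / 2.0 ** (n - 1)
    # Walk the documents in predicted-rank order, adding utility
    # 1/(r + 1) weighted by the probability of reaching rank r.
    err_val, p_reach = 0.0, 1.0
    for r, obj in enumerate(np.argsort(pred_ranks)):
        err_val += p_reach * sat_prob[obj] / (r + 1)
        p_reach *= 1.0 - sat_prob[obj]
    return err_val


def test_err_reference_reproduces_manual_example():
    # Object-order ranks for the orderings above: object 0 has true
    # rank 2, object 1 true rank 0, object 2 true rank 1.
    assert err_reference([2, 0, 1], [2, 1, 0]) == approx(17 / 32)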
def test_err_implementations_equivalent():
    """Spot-checks equivalence of the plain Python and TF implementations."""
    # A simple grading where each grade occurs once. We check for
    # equivalence at a sample of the permutations of this grading.
    elems = np.array([4, 3, 2, 1, 0])
    y_true = np.reshape(elems, (1, -1))
    # Spot-check every 20th permutation (5! / 20 = 6 checks are performed).
    for perm in list(itertools.permutations(elems))[::20]:
        perm = np.reshape(perm, (1, -1))
        assert K.eval(err(y_true, perm)) == approx(err_np(y_true, perm))
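# If the six-permutation spot check ever feels too thin, the exhaustive
# sweep is still cheap at n = 5. A hedged sketch (the test name is
# hypothetical; it reuses the err/err_np/K/approx/np/itertools names this
# file already imports):
def test_err_implementations_equivalent_exhaustive():
    elems = np.array([4, 3, 2, 1, 0])
    y_true = np.reshape(elems, (1, -1))
    # All 5! = 120 permutations, not just every 20th one.
    for perm in itertools.permutations(elems):
        y_pred = np.reshape(perm, (1, -1))
        assert K.eval(err(y_true, y_pred)) == approx(err_np(y_true, y_pred))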