def testProbabilisticInterleaveWithDeterministicRankers(self):
    pi = ProbabilisticInterleave(None)
    # test a few possible interleavings
    r1 = DeterministicRankingFunction(None, self.weights_1)
    r2 = DeterministicRankingFunction(None, self.weights_2)
    test_lists = {"0,1,3,2": 0, "1,0,3,2": 0, "1,3,0,2": 0, "1,3,2,0": 0}
    trials = 0
    MAX_TRIALS = 10000
    while trials < MAX_TRIALS and 0 in test_lists.values():
        trials += 1
        (l, a) = pi.interleave(r1, r2, self.query, 10)
        list_str = ",".join(str(docid) for docid in l.tolist())
        self.assertIn(list_str, test_lists.keys())
        test_lists[list_str] += 1
    for list_str, count in test_lists.items():
        self.assertNotEqual(0, count,
                            "Interleave failed for: %s" % list_str)
    # test interleaving outcomes
    context = (None, r1, r2)
    self.assertEqual(
        pi.infer_outcome([0, 1, 2, 3], context, [0, 0, 0, 0], self.query),
        0, "No clicks, outcome should be 0.")
    self.assertEqual(
        pi.infer_outcome([0, 1, 2, 3], context, [1, 0, 0, 0], self.query),
        0, "No possible assignment, outcome should be 0.")
    o = pi.infer_outcome([1, 0, 3, 2], context, [1, 0, 0, 0], self.query)
    self.assertAlmostEquals(o, -0.0625, 4,
                            "Ranker 1 should win (o = %.4f)." % o)
    o = pi.infer_outcome([0, 1, 3, 2], context, [1, 0, 0, 0], self.query)
    self.assertAlmostEquals(o, 0.0625, 4,
                            "Ranker 2 should win (o = %.4f)." % o)
    # test get_probability_of_list
    p = pi.get_probability_of_list([1, 0, 3, 2], context, self.query)
    self.assertEqual(p, 0.25,
                     "Probability of the most likely list. p = %g" % p)
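For reference, the 0.25 asserted above follows from the marginalized list probability P(l) = prod_i (1/2 P_1(d_i) + 1/2 P_2(d_i)), where each ranker's distribution is renormalized over the not-yet-picked documents. The sketch below recomputes it outside the framework; the two deterministic rankings [0, 1, 3, 2] and [1, 3, 2, 0] are an assumption inferred from the four interleavings the test expects, not taken from the source.

# Illustration only: marginal probability of an interleaved list under two
# deterministic rankers. A deterministic ranker puts probability 1 on its
# highest-ranked remaining document and 0 on all others.
def prob_of_list(l, ranking_1, ranking_2):
    p = 1.0
    seen = set()
    for doc in l:
        head_1 = next(d for d in ranking_1 if d not in seen)
        head_2 = next(d for d in ranking_2 if d not in seen)
        p *= 0.5 * (1.0 if doc == head_1 else 0.0) \
            + 0.5 * (1.0 if doc == head_2 else 0.0)
        seen.add(doc)
    return p

print(prob_of_list([1, 0, 3, 2], [0, 1, 3, 2], [1, 3, 2, 0]))  # 0.25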
def infer_outcome(self, l, context, c, query):
    # infer live outcome
    live_outcome = ProbabilisticInterleave.infer_outcome(
        self, l, context, c, query)
    # The following works only from Python 3 onwards (this code currently
    # targets Python 2.7).
    # live_outcome = super().infer_outcome(l, context, c, query)

    # For each historical data point, infer the outcome under the target
    # rankers and re-weight it using importance sampling.
    h_outcomes = []
    for h_item in self.history:
        # use the current context (rankers), but the historical list and
        # clicks
        raw_outcome = ProbabilisticInterleave.infer_outcome(
            self, h_item.result_list, context, h_item.clicks, h_item.query)
        # probability of the result list under the target distribution
        p_list_target = self.get_probability_of_list(
            h_item.result_list, context, h_item.query)
        if self.biased:
            weight = 1.0
        else:
            weight = p_list_target / h_item.p_list_source
        h_outcomes.append(raw_outcome * weight)

    # TODO: implement alternatives.
    # How should the two estimates be combined? Both estimate the expected
    # comparison outcome under the target distribution (rankers), so they
    # could in principle simply be averaged. However, the estimate based
    # on historical data has a much higher variance than the live
    # estimate, whose variance cannot be estimated from a single sample
    # (var_live is therefore fixed at 1.0 below). The two estimates are
    # combined using inverse-variance weighting.
    combined_outcome = 0.0
    mean_hist = mean(h_outcomes) if len(h_outcomes) > 0 else 0.0
    if live_outcome == 0.0 and mean_hist != 0.0:
        combined_outcome = mean_hist
    elif live_outcome != 0.0 and mean_hist == 0.0:
        combined_outcome = live_outcome
    else:
        var_live = 1.0
        var_hist = var(h_outcomes) if len(h_outcomes) > 1 else 1000.0
        combined_outcome = ((var_live * mean_hist
                             + var_hist * live_outcome)
                            / (var_live + var_hist))

    # Add the current live data point to the history, keeping the history
    # at or below length self.history_length.
    if self.history_length > 0:
        if len(self.history) == self.history_length:
            self.history.pop(0)
        # store the probability of the observed list under the source
        # distribution so that it only has to be computed once
        new_h_item = HistoryItem(l, context, c, query)
        new_h_item.p_list_source = self.get_probability_of_list(
            l, context, query)
        self.history.append(new_h_item)
    # return the combined outcome
    return combined_outcome
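A minimal numeric sketch of the inverse-variance combination above: each estimate is weighted by the variance of the other, so the lower-variance estimate dominates. All numbers are made up for illustration.

from numpy import mean, var

live_outcome = -1.0                  # the single live comparison outcome
h_outcomes = [0.5, -2.0, 1.5, -1.0]  # importance-weighted historical outcomes
var_live = 1.0                       # assumed, as in the code above
mean_hist = mean(h_outcomes)         # -0.25
var_hist = var(h_outcomes)           # 1.8125
combined = ((var_live * mean_hist + var_hist * live_outcome)
            / (var_live + var_hist))
# combined = (1.0 * -0.25 + 1.8125 * -1.0) / 2.8125 = -0.733...; the
# noisier historical mean pulls the live outcome towards zero only
# slightly, because the historical estimate carries the higher variance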
def testProbabilisticInterleave(self):
    pi = ProbabilisticInterleave(None)
    r1 = ProbabilisticRankingFunction(3, self.weights_1)
    r2 = ProbabilisticRankingFunction(3, self.weights_2)
    context = (None, r1, r2)
    # test get_probability_of_list
    p = pi.get_probability_of_list([1, 0, 3, 2], context, self.query)
    self.assertAlmostEquals(p, 0.182775, 6,
                            "Probability of the most likely list. "
                            "p = %.6f" % p)
    # test a few possible interleavings
    test_lists = {"0,1,2,3": 0, "0,1,3,2": 0, "0,2,1,3": 0, "0,2,3,1": 0,
                  "0,3,1,2": 0, "0,3,2,1": 0, "1,0,2,3": 0, "1,0,3,2": 0,
                  "1,2,0,3": 0, "1,2,3,0": 0, "1,3,0,2": 0, "1,3,2,0": 0,
                  "2,0,1,3": 0, "2,0,3,1": 0, "2,1,0,3": 0, "2,1,3,0": 0,
                  "2,3,0,1": 0, "2,3,1,0": 0, "3,0,1,2": 0, "3,0,2,1": 0,
                  "3,1,0,2": 0, "3,1,2,0": 0, "3,2,0,1": 0, "3,2,1,0": 0}
    trials = 0
    MAX_TRIALS = 100000
    while trials < MAX_TRIALS and 0 in test_lists.values():
        trials += 1
        (l, _) = pi.interleave(r1, r2, self.query, 10)
        list_str = ",".join(str(docid) for docid in l.tolist())
        self.assertIn(list_str, test_lists.keys())
        test_lists[list_str] += 1
    for list_str, count in test_lists.items():
        self.assertNotEqual(0, count,
                            "Interleave failed for: %s" % list_str)
    # test interleaving outcomes
    self.assertEqual(
        pi.infer_outcome([0, 1, 2, 3], context, [0, 0, 0, 0], self.query),
        0, "No clicks, outcome should be 0.")
    o = pi.infer_outcome([1, 0, 3, 2], context, [1, 0, 0, 0], self.query)
    self.assertAlmostEquals(o, -0.0486, 4,
                            "Ranker 1 should win (o = %.4f)." % o)
    o = pi.infer_outcome([0, 1, 3, 2], context, [1, 0, 0, 0], self.query)
    self.assertAlmostEquals(o, 0.0606, 4,
                            "Ranker 2 should win (o = %.4f)." % o)
    # from the example in the CIKM 2011 paper
    weight_str_1 = "0 0 1 0 -1 0"
    weights_1 = np.asarray([float(x) for x in weight_str_1.split()])
    weight_str_2 = "1 0 0 0 -1 0"
    weights_2 = np.asarray([float(x) for x in weight_str_2.split()])
    r1 = ProbabilisticRankingFunction(3, weights_1)
    r2 = ProbabilisticRankingFunction(3, weights_2)
    context = (None, r2, r1)
    o = pi.infer_outcome([0, 1, 2, 3], context, [0, 1, 1, 0], self.query)
    self.assertAlmostEquals(o, 0.0046, 4,
                            "Ranker 2 should win again (o = %.4f)." % o)
    # click on the second-to-last document
    o = pi.infer_outcome([3, 1, 0, 2], context, [0, 0, 1, 0], self.query)
    self.assertAlmostEquals(
        o, -0.0496, 4,
        "Ranker 1 should win with click on doc 0 (o = %.4f)." % o)
    # click on the last document
    o = pi.infer_outcome([3, 1, 2, 0], context, [0, 0, 0, 1], self.query)
    self.assertAlmostEquals(o, 0.0, 4,
                            "Tie for click on last doc (o = %.4f)." % o)
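A rough sketch of where probabilities like 0.182775 come from. It assumes (an assumption about the ranker implementation, not confirmed by this file) that the first argument to ProbabilisticRankingFunction is the steepness tau of a rank-based softmax, P(d) proportional to 1 / rank(d)^tau:

tau = 3.0
ranks = [1, 2, 3, 4]                   # document ranks under one ranker
raw = [1.0 / r ** tau for r in ranks]  # [1.0, 0.125, 0.037..., 0.0156...]
probs = [w / sum(raw) for w in raw]    # approx. [0.849, 0.106, 0.031, 0.013]
# the interleaved-list probability then marginalizes these per-ranker
# document probabilities as P(l) = prod_i (1/2 P_1(d_i) + 1/2 P_2(d_i)),
# renormalizing over the remaining documents at each rank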
class HistProbabilisticInterleave(AbstractHistInterleavedComparison):
    """Probabilistic interleaving using historical data."""

    def __init__(self, arg_str=None):
        self.pi = ProbabilisticInterleave(arg_str)
        self.biased = False
        self.marginalize = True
        if arg_str:
            parser = argparse.ArgumentParser(
                description="Parse arguments for interleaving method.",
                prog=self.__class__.__name__)
            parser.add_argument("-b", "--biased")
            parser.add_argument("-m", "--marginalize")
            args = vars(parser.parse_known_args(split_arg_str(arg_str))[0])
            if args["biased"] in ("False", None, 0):
                self.biased = False
            else:
                self.biased = True
            if args["marginalize"] in ("False", 0):
                self.marginalize = False
            else:
                self.marginalize = True

    def infer_outcome(self, l, source_context, c, target_r1, target_r2,
                      query):
        # for probabilistic interleaving, the context is (a, r1, r2)
        (a, r1, r2) = source_context
        if self.marginalize:
            return self._infer_outcome_with_marginalization(
                l, a, c, r1, r2, target_r1, target_r2, query, self.biased)
        else:
            return self._infer_outcome_no_marginalization(
                l, a, c, r1, r2, target_r1, target_r2, query, self.biased)

    def _infer_outcome_with_marginalization(self, l, a, c, r1, r2,
                                            target_r1, target_r2, query,
                                            biased):
        # get the outcome using the target rankers
        target_context = (None, target_r1, target_r2)
        if r1 == r2:
            raise ValueError("r1 and r2 cannot point to the same object.")
        outcome = self.pi.infer_outcome(l, target_context, c, query)
        if outcome == 0:
            return 0
        if biased:
            return outcome
        # if biased is False, compensate for bias using importance sampling
        target_p_list = self.pi.get_probability_of_list(l, target_context,
                                                        query)
        orig_context = (None, r1, r2)
        orig_p_list = self.pi.get_probability_of_list(l, orig_context,
                                                      query)
        if target_p_list == 0 or orig_p_list == 0:
            logging.warn("Encountered zero probabilities: p(l_target) = "
                         "%.2f, p(l_orig) = %.2f"
                         % (target_p_list, orig_p_list))
            return 0
        return outcome * target_p_list / orig_p_list

    def _infer_outcome_no_marginalization(self, l, a, c, r1, r2, target_r1,
                                          target_r2, query, biased):
        # are there any clicks? (otherwise it's a tie)
        click_ids = where(asarray(c) == 1)
        if not len(click_ids[0]):
            # no clicks, will be a tie
            return 0
        # for the observed list and assignment, get the outcome (as in
        # team-draft interleaving)
        c1 = sum([1. if val_a == 0 and val_c == 1 else 0.
                  for val_a, val_c in zip(a, c)])
        c2 = sum([1. if val_a == 1 and val_c == 1 else 0.
                  for val_a, val_c in zip(a, c)])
        outcome = -1. if c1 > c2 else 1. if c2 > c1 else 0.
        if biased:
            return outcome
        # if biased is False, compensate for bias using importance
        # sampling: weight the outcome by the probability of observing
        # this list and assignment under the target vs. the source rankers
        target_p = self._get_probability_of_list_and_assignment(
            l, a, target_r1, target_r2, query)
        if target_p == 0:
            return 0.
        orig_p = self._get_probability_of_list_and_assignment(l, a, r1, r2,
                                                              query)
        if orig_p == 0:
            return 0.
        r2.init_ranking(query)
        return outcome * target_p / orig_p

    def _get_probability_of_list_and_assignment(self, l, a, r1, r2, query):
        # P(l, a) = \prod_i 1/2 * P_{a_i}(d_i), where P_{a_i} is the
        # document distribution of the ranker that document d_i was
        # assigned to
        p_l_a = 1.0
        r1.init_ranking(query)
        r2.init_ranking(query)
        for i, doc in enumerate(l):
            # an assignment of -1 (unassigned) is treated like an
            # assignment to r1
            if a[i] == -1 or a[i] == 0:
                p_d = r1.get_document_probability(doc)
            elif a[i] == 1:
                p_d = r2.get_document_probability(doc)
            else:
                logging.warn("Illegal assignment: %s" % a)
                return 0.
            p_l_a *= 0.5 * p_d
            r1.rm_document(doc)
            r2.rm_document(doc)
        return p_l_a
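A toy illustration (made-up probabilities, no ranker objects) of the importance-sampling correction in _infer_outcome_no_marginalization: the historically observed outcome is re-weighted by how likely the same list and assignment would have been under the target rankers relative to the source rankers.

outcome = 1.0      # ranker 2 won under the source (historical) rankers
orig_p = 0.004     # P(l, a) under the source rankers (hypothetical)
target_p = 0.001   # P(l, a) under the target rankers (hypothetical)
weighted = outcome * target_p / orig_p
# weighted = 0.25: the win is down-weighted because the target rankers
# would rarely have produced this list and assignment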
class ExploitativeProbabilisticInterleave(AbstractInterleavedComparison):
    """Probabilistic interleaving that balances exploration with
    exploitation and marginalizes over assignments."""

    def __init__(self, arg_str=None):
        self.pi = ProbabilisticInterleave(arg_str)
        if arg_str:
            parser = argparse.ArgumentParser(
                description="Parse arguments for interleaving method.",
                prog=self.__class__.__name__)
            parser.add_argument("-a", "--aggregate",
                                choices=["expectation",
                                         "log-likelihood-ratio",
                                         "likelihood-ratio", "log-ratio",
                                         "binary"],
                                default="expectation")
            parser.add_argument("-e", "--exploration_rate", type=float,
                                required=True,
                                help="Exploration rate, 0.5 = perfect "
                                "exploration, 0.0 = perfect exploitation.")
            parser.add_argument("-b", "--biased", default="False")
            args = vars(parser.parse_known_args(split_arg_str(arg_str))[0])
            self.exploration_rate = args["exploration_rate"]
            self.aggregate = args["aggregate"]
            self.biased = string_to_boolean(args["biased"])
        else:
            raise ValueError("Configuration arguments required. Please "
                             "provide at least a value for the "
                             "exploration rate.")

    def _get_document_distribution(self, r1, r2):
        """Compute the combined distribution over documents. Return two
        lists (docids, probs), where probs[i] is the probability of the
        document docids[i]."""
        docids = r1.get_ranking()
        probs = []
        for docid in docids:
            p1 = r1.get_document_probability(docid)
            p2 = r2.get_document_probability(docid)
            # r1 is always assumed to be the exploitative ranker
            probs.append((1 - self.exploration_rate) * p1
                         + self.exploration_rate * p2)
        # sort docids and probs by probability; the sort is stable, so
        # ties are broken according to ranking r1
        probs, docids = zip(*sorted(zip(probs, docids), reverse=True))
        probs = asarray(probs)
        docids = list(docids)
        return (docids, probs)

    def interleave(self, r1, r2, query, length):
        # compute combined probabilities
        r1.init_ranking(query)
        r2.init_ranking(query)
        length = min(r1.document_count(), r2.document_count(), length)
        l = []
        while len(l) < length:
            (docids, probs) = self._get_document_distribution(r1, r2)
            # sample from the distribution without replacement, unless
            # there is only one document left
            if len(docids) == 1:
                pick = docids.pop(0)
            else:
                # same as in ProbabilisticRankingFunction
                cumprobs = cumsum(probs)
                pick = -1
                rand = random()  # produces a float in range [0.0, 1.0)
                for pos, cp in enumerate(cumprobs):
                    if rand < cp:
                        pick = docids.pop(pos)  # pop, because it's a list
                        break
                if pick == -1:
                    print "Cumprobs:", cumprobs
                    print "rand", rand
                    raise Exception("Could not select document!")
            # remove the picked docid from r1 and r2
            r1.rm_document(pick)
            r2.rm_document(pick)
            l.append(pick)
        # context is (a, r1, r2) with the assignment marginalized out,
        # matching the unpacking in _get_source_probability_of_list
        return (asarray(l), (None, r1, r2))

    def _get_source_probability_of_list(self, l, a, query):
        p_l = 1.0
        (_, r1, r2) = a
        r1.init_ranking(query)
        r2.init_ranking(query)
        for doc in l:
            p_r1 = r1.get_document_probability(doc)
            p_r2 = r2.get_document_probability(doc)
            r1.rm_document(doc)
            r2.rm_document(doc)
            p_l *= ((1 - self.exploration_rate) * p_r1
                    + self.exploration_rate * p_r2)
        return p_l

    def infer_outcome(self, l, a, c, query):
        # for probabilistic interleaving, the context is (a, r1, r2)
        outcome = self.pi.infer_outcome(l, a, c, query)
        # return the outcome unless bias compensation is needed
        if outcome == 0:
            return 0
        if self.biased or self.exploration_rate == 0.5:
            return outcome
        # apply importance sampling to obtain unbiased outcomes
        target_p_list = self.pi.get_probability_of_list(l, a, query)
        source_p_list = self._get_source_probability_of_list(l, a, query)
        if target_p_list == 0 or source_p_list == 0:
            logging.warn("Encountered zero probabilities: p(l_target) = "
                         "%.2f, p(l_source) = %.2f"
                         % (target_p_list, source_p_list))
            return 0
        return outcome * target_p_list / source_p_list
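A minimal sketch (hypothetical per-document probabilities, no ranker objects) of the mixture computed in _get_document_distribution: with exploration_rate below 0.5, the interleaved list is biased towards the exploitative ranker r1, while r2 keeps some probability mass.

k = 0.1                               # exploration rate (example value)
p1 = {"a": 0.7, "b": 0.2, "c": 0.1}   # hypothetical P_1 over documents
p2 = {"a": 0.1, "b": 0.3, "c": 0.6}   # hypothetical P_2 over documents
mixed = dict((d, (1 - k) * p1[d] + k * p2[d]) for d in p1)
# mixed == {"a": 0.64, "b": 0.21, "c": 0.15}: close to p1 (exploitation),
# but documents preferred by p2 still have a chance to appear (exploration)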