def compute_mean(metric_key,
                 labels,
                 predictions,
                 weights=None,
                 topn=None,
                 name=None):
  """Returns the mean of the specified metric given the inputs.

  Args:
    metric_key: A key in `RankingMetricKey`.
    labels: A `Tensor` of the same shape as `predictions` representing
      relevance.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: An `integer` specifying the cutoff of how many items are considered
      in the metric.
    name: A `string` used as the name for this metric.

  Returns:
    A scalar as the computed metric.
  """
  metric_dict = {
      RankingMetricKey.ARP: metrics_impl.ARPMetric(metric_key),
      RankingMetricKey.MRR: metrics_impl.MRRMetric(metric_key, topn),
      RankingMetricKey.NDCG: metrics_impl.NDCGMetric(name, topn),
      RankingMetricKey.DCG: metrics_impl.DCGMetric(name, topn),
      RankingMetricKey.PRECISION: metrics_impl.PrecisionMetric(name, topn),
      RankingMetricKey.RECALL: metrics_impl.RecallMetric(name, topn),
      RankingMetricKey.MAP: metrics_impl.MeanAveragePrecisionMetric(name, topn),
      RankingMetricKey.ORDERED_PAIR_ACCURACY: metrics_impl.OPAMetric(name),
      RankingMetricKey.BPREF: metrics_impl.BPrefMetric(name, topn),
      RankingMetricKey.HITS: metrics_impl.HitsMetric(metric_key, topn),
  }
  assert metric_key in metric_dict, ('metric_key %s not supported.' %
                                     metric_key)
  # TODO: Add mask argument for metric.compute() call
  metric, weight = metric_dict[metric_key].compute(labels, predictions,
                                                   weights)
  return tf.compat.v1.div_no_nan(
      tf.reduce_sum(input_tensor=metric * weight),
      tf.reduce_sum(input_tensor=weight))
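# Example usage of `compute_mean` (a minimal sketch, not part of the library:
# it assumes eager execution and that `tf` and `RankingMetricKey` are imported
# as in this module; the tensors below are purely illustrative):
#
#   labels = tf.constant([[0., 0., 1.], [0., 1., 2.]])
#   predictions = tf.constant([[0.1, 0.9, 0.2], [0.3, 0.2, 0.5]])
#   # Mean reciprocal rank over the batch, considering the top 3 items.
#   mrr = compute_mean(RankingMetricKey.MRR, labels, predictions, topn=3)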
def binary_preference(labels,
                      predictions,
                      weights=None,
                      topn=None,
                      name=None,
                      use_trec_version=True):
  """Computes binary preference (BPref).

  The implementation of BPref is based on the description in the following:
  https://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf

    BPref = 1 / R SUM_r(1 - |n ranked higher than r| / min(R, N))

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.
    name: A string used as the name for this metric.
    use_trec_version: A boolean to choose the version of the formula to use.
      If False, then the alternative BPref formula will be used:

        BPref = 1 / R SUM_r(1 - |n ranked higher than r| / R)

  Returns:
    A metric for the binary preference of the batch.
  """
  metric = metrics_impl.BPrefMetric(
      name, topn, use_trec_version=use_trec_version)
  with tf.compat.v1.name_scope(metric.name, 'binary_preference',
                               (labels, predictions, weights)):
    # TODO: Add mask argument for metric.compute() call
    per_list_bpref, per_list_weights = metric.compute(labels, predictions,
                                                      weights)
    return tf.compat.v1.metrics.mean(per_list_bpref, per_list_weights)
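# Example usage of `binary_preference` (a minimal sketch, not part of the
# library). Note that `tf.compat.v1.metrics.mean` returns a (value, update_op)
# pair; in TF1 graph mode this assumes local variables have been initialized
# and the ops are evaluated in a session. The tensors below are illustrative:
#
#   labels = tf.constant([[0., 1., 1.], [1., 0., 0.]])
#   predictions = tf.constant([[0.2, 0.8, 0.4], [0.9, 0.1, 0.3]])
#   bpref_value, bpref_update_op = binary_preference(
#       labels, predictions, topn=2, use_trec_version=True)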