def average_precision_score(y_true, y_score, average="macro", pos_label=1, sample_weight=None): def _binary_uninterpolated_average_precision(y_true, y_score, pos_label=1, sample_weight=None): precision, recall, _ = precision_recall_curve_modified( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) # Return the step function integral # The following works because the last entry of precision is # guaranteed to be 1, as returned by precision_recall_curve return -np.sum(np.diff(recall) * np.array(precision)[:-1]) y_type = type_of_target(y_true) if y_type == "multilabel-indicator" and pos_label != 1: raise ValueError("Parameter pos_label is fixed to 1 for " "multilabel-indicator y_true. Do not set " "pos_label or set pos_label to 1.") elif y_type == "binary": present_labels = np.unique(y_true) if len(present_labels) == 2 and pos_label not in present_labels: raise ValueError("pos_label=%r is invalid. Set it to a label in " "y_true." % pos_label) average_precision = partial(_binary_uninterpolated_average_precision, pos_label=pos_label) return _average_binary_score(average_precision, y_true, y_score, average, sample_weight=sample_weight)
def average_precision_score(y_true, y_score, average="macro", sample_weight=None): def _binary_average_precision(y_true, y_score, sample_weight=None): precision, recall, thresholds = precision_recall_curve( y_true, y_score, sample_weight=sample_weight) return auc(recall, precision) return _average_binary_score(_binary_average_precision, y_true, y_score, average, sample_weight=sample_weight)
def average_precision_score(y_true, y_score, average="macro", pos_label=1, sample_weight=None): def _binary_uninterpolated_average_precision(y_true, y_score, pos_label=1, sample_weight=None): precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label, sample_weight) recall[np.isnan(recall)] = 0 return -np.sum(np.diff(recall) * np.array(precision)[:-1]) average_precision = functools.partial(_binary_uninterpolated_average_precision, pos_label=pos_label) return _average_binary_score(average_precision, y_true, y_score, average, sample_weight)
def test_averaging_binary_multilabel_all_zeroes():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred
    # Test _average_binary_score for weight.sum() == 0
    binary_metric = (lambda y_true, y_score, average="macro":
                     _average_binary_score(
                         precision_score, y_true, y_score, average))
    _check_averaging(binary_metric, y_true, y_pred, y_true_binarize,
                     y_pred_binarize, is_multilabel=True)
def binary_average_precision(y_true, y_score, interpolated_auc=True):
    def _average_precision(y_true_, y_score_, sample_weight=None):
        # Pass sample_weight by keyword: the third positional argument of
        # precision_recall_curve is pos_label, not sample_weight.
        precision, recall, _ = precision_recall_curve(
            y_true_, y_score_, sample_weight=sample_weight)
        if not interpolated_auc:
            # Return the step function integral
            # The following works because the last entry of precision is
            # guaranteed to be 1, as returned by precision_recall_curve
            return -1 * np.sum(np.diff(recall) * np.array(precision)[:-1])
        return auc(recall, precision)

    return _average_binary_score(_average_precision, y_true, y_score,
                                 average="macro")
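# Illustrative sketch (independent of the helper above): compares the
# trapezoidal ("interpolated") area with the step-function sum on a toy
# problem with tied scores, where the two estimates differ.
import numpy as np
from sklearn.metrics import precision_recall_curve, auc

y_true = np.array([0, 1, 0, 1, 1, 0, 0, 1])
y_score = np.array([0.9, 0.9, 0.6, 0.6, 0.6, 0.3, 0.3, 0.3])  # heavy ties
precision, recall, _ = precision_recall_curve(y_true, y_score)
interpolated = auc(recall, precision)                        # trapezoidal rule
step = -np.sum(np.diff(recall) * np.array(precision)[:-1])   # step integral
print(interpolated, step)  # generally not equal when scores are tied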
def pr_auc_score(y_true, y_score, average='micro', sample_weight=None):
    def _binary_pr_auc_score(y_true, y_score, sample_weight=None):
        if len(np.unique(y_true)) != 2:
            raise ValueError("Only one class present in y_true. AUPRC score "
                             "is not defined in that case.")
        precision, recall, thresholds = precision_recall_curve(
            y_true, y_score, sample_weight=sample_weight)
        # Area under the precision-recall curve, with recall on the x-axis.
        return auc(recall, precision, reorder=False)

    return _average_binary_score(_binary_pr_auc_score, y_true, y_score,
                                 average, sample_weight=sample_weight)
def test_averaging_multilabel_all_zeroes():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros((20, 3))
    y_score = np.zeros((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred
    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize,
               y_pred, y_pred_binarize, y_score)

    # Test _average_binary_score for weight.sum() == 0
    binary_metric = (lambda y_true, y_score, average="macro":
                     _average_binary_score(
                         precision_score, y_true, y_score, average))
    _check_averaging(binary_metric, y_true, y_pred, y_true_binarize,
                     y_pred_binarize, is_multilabel=True)
def VOC_mAP(y_trues, y_scores, sample_weight=None, ignore_index=-100):
    '''Calculate mean Average Precision (mAP) for VOC outputs.

    Assumes [n_samples, n_classes] for y_trues, y_scores.'''
    if y_trues.ndim == y_scores.ndim == 1 and y_trues.shape == y_scores.shape:
        y_trues = y_trues[:, None]
        y_scores = y_scores[:, None]
    assert (y_trues.ndim == 2 and y_scores.ndim == 2
            and y_trues.shape == y_scores.shape)
    rets = []
    for x in range(y_scores.shape[1]):
        y_true, y_score = y_trues[:, x], y_scores[:, x]
        mask = y_true != ignore_index
        y_true, y_score = y_true[mask], y_score[mask]
        ret = _average_binary_score(
            VOC_AP, y_true, y_score, average='macro',
            sample_weight=sample_weight if sample_weight is None
            else sample_weight[mask])
        rets.append(ret)
    ret = np.mean(rets)
    return ret
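# Hypothetical usage sketch: VOC_AP and _average_binary_score are assumed to
# be defined alongside VOC_mAP (they are not shown here). Entries equal to
# ignore_index are dropped per class before the per-class AP is averaged.
import numpy as np

y_trues = np.array([[1, 0, -100],
                    [0, 1, 1],
                    [1, -100, 0],
                    [0, 1, 1]])      # -100 marks ignored annotations
y_scores = np.random.rand(4, 3)      # one score column per class
mAP = VOC_mAP(y_trues, y_scores, ignore_index=-100)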
def average_precision_score(y_true, y_score, average="macro", sample_weight=None, interpolation="linear"): """Compute average precision (AP) from prediction scores This score corresponds to the area under the precision-recall curve, where points are joined using either linear or step-wise interpolation. Note: this implementation is restricted to the binary classification task or multilabel classification task. Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels in binary label indicators. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'micro'``: Calculate metrics globally by considering each element of the label indicator matrix as a label. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). ``'samples'``: Calculate metrics for each instance, and find their average. sample_weight : array-like of shape = [n_samples], optional Sample weights. interpolation : string ['linear' (default), 'step'] Determines the kind of interpolation used when computed AUC. If there are many repeated scores, 'step' is recommended to avoid under- or over- estimating the AUC. See www.roamanalytics.com/etc for details. ``'linear'``: Linearly interpolates between operating points. ``'step'``: Uses a step function to interpolate between operating points. Returns ------- average_precision : float References ---------- .. [1] `Wikipedia entry for the Average precision <http://en.wikipedia.org/wiki/Average_precision>`_ See also -------- roc_auc_score : Area under the ROC curve precision_recall_curve : Compute precision-recall pairs for different probability thresholds Examples -------- >>> import numpy as np >>> from sklearn.metrics import average_precision_score >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS 0.79... """ def _binary_average_precision(y_true, y_score, sample_weight=None): precision, recall, thresholds = precision_recall_curve( y_true, y_score, sample_weight=sample_weight) return auc(recall, precision, interpolation=interpolation, interpolation_direction='right') if interpolation == "linear": # Check for number of unique predictions. If this is substantially less # than the number of predictions, linear interpolation is likely to be # biased. n_discrete_predictions = len(np.unique(y_score)) if n_discrete_predictions < 0.75 * len(y_score): warnings.warn("Number of unique scores is less than 75% of the " "number of scores provided. Linear interpolation " "is likely to be biased in this case. You may wish " "to use step interpolation instead. See docstring " "for details.") return _average_binary_score(_binary_average_precision, y_true, y_score, average, sample_weight=sample_weight)
def roc_auc_score(y_true, y_score, average="macro", sample_weight=None,
                  max_fpr=None):
    """Compute Area Under the Curve (AUC) from prediction scores

    Note: this implementation is restricted to the binary classification task
    or multilabel classification task in label indicator format.

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, n_classes]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data:

        ``'micro'``:
            Calculate metrics globally by considering each element of the
            label indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    max_fpr : float, optional
        If not ``None``, the standardized partial AUC over the range
        [0, max_fpr] is returned.

    Returns
    -------
    auc : float

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
           <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] `Analyzing a portion of the ROC curve. McClish, 1989
           <http://www.ncbi.nlm.nih.gov/pubmed/2668680>`_

    See also
    --------
    average_precision_score : Area under the precision-recall curve

    roc_curve : Compute Receiver operating characteristic (ROC)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> roc_auc_score(y_true, y_scores)
    0.75

    """
    def _binary_roc_auc_score(y_true, y_score, sample_weight=None,
                              max_fpr=max_fpr):
        if len(np.unique(y_true)) != 2:
            raise ValueError("Only one class present in y_true. ROC AUC "
                             "score is not defined in that case.")

        fpr, tpr, thresholds = roc_curve(y_true, y_score,
                                         sample_weight=sample_weight)
        if max_fpr:
            idx = np.where(fpr <= max_fpr)[0]
            # linearly interpolate the ROC curve until max_fpr
            idx_last = idx.max()
            idx_next = idx_last + 1
            xc = [fpr[idx_last], fpr[idx_next]]
            yc = [tpr[idx_last], tpr[idx_next]]
            tpr = np.r_[tpr[idx], np.interp(max_fpr, xc, yc)]
            fpr = np.r_[fpr[idx], max_fpr]
            partial_roc = auc(fpr, tpr, reorder=True)
            # standardize result to lie between 0.5 and 1 (McClish, 1989)
            min_area = max_fpr ** 2 / 2
            max_area = max_fpr
            return 0.5 * (1 + (partial_roc - min_area) / (max_area - min_area))
        return auc(fpr, tpr, reorder=True)

    return _average_binary_score(
        _binary_roc_auc_score, y_true, y_score, average,
        sample_weight=sample_weight)
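# Illustrative call (uses the function defined above, not necessarily the
# released scikit-learn API): restricting the ROC curve to fpr <= 0.2 and
# applying McClish's standardization, where for max_fpr = 0.2 the chance-level
# area is min_area = 0.2**2 / 2 = 0.02 and the maximum area is 0.2, so a raw
# partial AUC of 0.1 maps to 0.5 * (1 + (0.1 - 0.02) / (0.2 - 0.02)) ~= 0.72.
import numpy as np

y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])
partial_auc = roc_auc_score(y_true, y_scores, max_fpr=0.2)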
def average_precision_score(y_true, y_score, average="macro", sample_weight=None): """Compute average precision (AP) from prediction scores This score corresponds to the area under the precision-recall curve. Note: this implementation is restricted to the binary classification task or multilabel classification task. Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels in binary label indicators. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'micro'``: Calculate metrics globally by considering each element of the label indicator matrix as a label. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). ``'samples'``: Calculate metrics for each instance, and find their average. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- average_precision : float References ---------- .. [1] `Wikipedia entry for the Average precision <https://en.wikipedia.org/wiki/Average_precision>`_ See also -------- roc_auc_score : Area under the ROC curve precision_recall_curve : Compute precision-recall pairs for different probability thresholds Examples -------- >>> import numpy as np >>> from sklearn.metrics import average_precision_score >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS 0.79... """ def _binary_average_precision(y_true, y_score, sample_weight=None): precision, recall, thresholds = precision_recall_curve( y_true, y_score, sample_weight=sample_weight) return auc(recall, precision) return _average_binary_score(_binary_average_precision, y_true, y_score, average, sample_weight=sample_weight)