def test_label_binarize_multilabel():
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = pos_label * y_ind
    y_sparse = [
        sparse_matrix(y_ind)
        for sparse_matrix in [
            coo_matrix,
            csc_matrix,
            csr_matrix,
            dok_matrix,
            lil_matrix,
        ]
    ]

    for y in [y_ind] + y_sparse:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
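# Hedged usage sketch (not part of the test above): with neg_label=0 and a
# non-zero pos_label, label_binarize on a multilabel indicator matrix can
# return a scipy.sparse result directly; the entries are scaled to pos_label.
import numpy as np
from scipy.sparse import issparse
from sklearn.preprocessing import label_binarize

y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
out = label_binarize(y_ind, classes=[0, 1, 2], pos_label=2, neg_label=0,
                     sparse_output=True)
print(issparse(out))   # True
print(out.toarray())   # [[0 2 0], [2 2 2], [0 0 0]]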
def test_label_binarize_multiclass():
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
def test_label_binarize_with_class_order():
    out = label_binarize([1, 6], classes=[1, 2, 4, 6])
    expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
    assert_array_equal(out, expected)

    # Modified class order
    out = label_binarize([1, 6], classes=[1, 6, 4, 2])
    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
    assert_array_equal(out, expected)

    out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
    expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
    assert_array_equal(out, expected)
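# Hedged sketch of the class-order behaviour tested above: the columns of the
# binarized output follow the order given in `classes`, while LabelBinarizer
# always uses its sorted classes_ and can invert the encoding.
import numpy as np
from sklearn.preprocessing import LabelBinarizer, label_binarize

y = [1, 6, 4, 2]
print(label_binarize(y, classes=[1, 6, 4, 2]))
# [[1 0 0 0]
#  [0 1 0 0]
#  [0 0 1 0]
#  [0 0 0 1]]

lb = LabelBinarizer().fit(y)
print(lb.classes_)                             # [1 2 4 6]
print(lb.inverse_transform(lb.transform(y)))   # [1 6 4 2]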
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                label_binarize(
                    y,
                    classes=classes,
                    neg_label=neg_label,
                    pos_label=pos_label,
                    sparse_output=sparse_output,
                )
            continue

        # check label_binarize
        binarized = label_binarize(
            y,
            classes=classes,
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
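# Public-API sketch of the neg_label/pos_label round trip exercised by
# check_binarized_results above: inverse_transform uses the midpoint
# (neg_label + pos_label) / 2 as the default decision threshold.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(neg_label=-2, pos_label=2)
encoded = lb.fit_transform([0, 1, 2, 1])
print(encoded)
# [[ 2 -2 -2]
#  [-2  2 -2]
#  [-2 -2  2]
#  [-2  2 -2]]
print(lb.inverse_transform(encoded))   # [0 1 2 1]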
def pr_auc(y_true, predict_probas, labels=None):
    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]
    precision, recall, _ = precision_recall_curve(y_true, predict_probas)
    pr_auc = sklearn.metrics.auc(recall, precision)
    return pr_auc
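# Hypothetical call to the pr_auc helper above on a tiny binary sample; it
# assumes the helper's own imports (numpy, sklearn.metrics, label_binarize,
# precision_recall_curve) are in scope.
import numpy as np

y_true = np.array([0, 0, 1, 1, 0, 1])
probas = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])
print(pr_auc(y_true, probas))   # area under the precision-recall curve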
def test_invalid_input_label_binarize():
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError, match="continuous target data is not "):
        label_binarize([1.2, 2.7], classes=[0, 1])
    with pytest.raises(ValueError, match="mismatch with the labels"):
        label_binarize([[1, 3]], classes=[1, 2, 3])
def lift(y_true, predict_probas, pct=0.05, labels=None):
    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]
    num_records = len(predict_probas)

    prediction = pd.DataFrame(data=predict_probas, columns=['prediction_proba'])
    prediction['label'] = y_true

    # Take the top pct of records ranked by predicted probability.
    top_pct = math.floor(num_records * pct)
    top = prediction.nlargest(top_pct, ['prediction_proba'])

    # Total number of positive (failure) labels in the whole sample.
    failures = len(y_true[y_true == 1])  # or 5% if failure rate > 5%

    num_failures_detected_in_top = len(top[top['label'] == 1])
    lift = num_failures_detected_in_top / failures
    return lift
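# Hypothetical call to the lift helper above: with pct=0.5 the top half of the
# predictions is inspected, and both positive labels land in it, so the helper
# returns 2 / 2 = 1.0. It assumes numpy, pandas, math and label_binarize are
# in scope, as in the function itself.
import numpy as np

y_true = np.array([0, 1, 0, 1])
probas = np.array([0.2, 0.9, 0.1, 0.8])
print(lift(y_true, probas, pct=0.5))   # 1.0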
def _hosmer_lemeshow(y_true, predict_probas, num_groups=10, labels=None):
    df = pd.DataFrame(data=predict_probas, columns=['prediction_proba'])
    if labels is None:
        labels = np.unique(y_true)
    y_true = label_binarize(y_true, classes=labels)[:, 0]
    df['label'] = y_true

    # Rank predictions into (up to) num_groups quantile bins.
    df['quantile_rank'] = pd.qcut(df['prediction_proba'], num_groups,
                                  labels=False, duplicates='drop')

    h = 0
    rows = []
    for i in range(num_groups):
        pcat_predictions = df[df['quantile_rank'] == i]
        num_observations = len(pcat_predictions)
        if num_observations == 0:
            continue

        # Observed vs. expected positives (category 1) in this bin.
        obs1 = len(pcat_predictions[pcat_predictions['label'] == 1])
        exp1 = pcat_predictions['prediction_proba'].mean() * num_observations
        lower_bound = pcat_predictions['prediction_proba'].min()
        upper_bound = pcat_predictions['prediction_proba'].max()
        obs0 = num_observations - obs1
        exp0 = num_observations - exp1

        h += ((obs1 - exp1) ** 2) / exp1 + ((obs0 - exp0) ** 2) / exp0
        rows.append({
            'decile': i + 1,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'num_observations': num_observations,
            'num_failures': obs1,
            'predicted_failures': exp1,
        })

    results = pd.DataFrame(rows, columns=[
        'decile', 'lower_bound', 'upper_bound', 'num_observations',
        'num_failures', 'predicted_failures'
    ])
    p = chi2.sf(h, num_groups - 2)
    return h, p, results
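# Hypothetical call to the _hosmer_lemeshow helper above. The labels are drawn
# from the predicted probabilities themselves, so the model is calibrated by
# construction and the chi-square statistic should be small (p-value large).
import numpy as np

rng = np.random.default_rng(0)
probas = rng.uniform(size=500)
y_true = (rng.uniform(size=500) < probas).astype(int)

h, p, table = _hosmer_lemeshow(y_true, probas, num_groups=10)
print(h, p)
print(table)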
def calculate_metrics(metrics, y_true, y_pred, labels=None):
    output = {}
    for metric_name in metrics:
        score_function = get_standard_metric(metric_name)
        if score_function is None:
            score_function = get_custom_metric(metric_name)
        if score_function is None:
            # Unknown metric: skip it (could be logged instead).
            continue
        if metric_name in class_prediction_metrics:
            # Metrics that operate on hard class predictions: binarize the
            # labels and take the argmax of the predicted probabilities.
            if labels is None:
                labels = np.unique(y_true)
            _y_true = label_binarize(y_true, classes=labels)[:, 0]
            score = score_function(_y_true, np.argmax(y_pred, axis=1))
        else:
            # Probability-based metrics use the positive-class column.
            score = score_function(y_true, y_pred[:, 1])
        output[metric_name] = score
    return output
def test_label_binarizer_errors():
    # Check that invalid arguments yield ValueError
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0,), (0, 2)]
    err_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=err_msg):
        lb.transform(multi_label)

    lb = LabelBinarizer()
    err_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=err_msg):
        lb.transform([])
    with pytest.raises(ValueError, match=err_msg):
        lb.inverse_transform([])

    input_labels = [0, 1, 0, 1]
    err_msg = "neg_label=2 must be strictly less than pos_label=1."
    lb = LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = "neg_label=2 must be strictly less than pos_label=2."
    lb = LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)
    err_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=err_msg):
        lb.fit(input_labels)

    # Fail on y_type
    err_msg = "foo format is not supported"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    err_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    err_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    err_msg = "output_type='binary', but y.shape"
    with pytest.raises(ValueError, match=err_msg):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    err_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=err_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=err_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
def visualize_testing_result(cls, truth_labels, pred_class_ids):
    '''
    Use to visualize results from using a saved
    model on a set of test-set samples. Draws
    a PR curve, and adds a table with the average
    precision (AP) of each class.
    '''
    # Find the number of classes involved:
    all_class_ids = set(truth_labels)
    num_classes = len(all_class_ids)

    # Will alternately treat each class
    # prediction as a one-vs-all binary
    # classification. For each class ID (cid<n>),
    # get 0/1 guess separately for each sample:
    #
    #                  cid0   cid1
    #   pred_sample0    1      0
    #   pred_sample1    0      0
    #   pred_sample2    0      1
    #            ...
    # Same with labels:
    #                  cid0   cid1
    #   labl_sample0    1      0
    #   labl_sample1    0      0
    #   labl_sample2    0      1
    #            ...
    bin_labels = label_binarize(truth_labels,
                                classes=list(range(num_classes)))

    # Make tensors just for manipulation convenience:
    bin_labels_tn = torch.tensor(bin_labels)
    preds_tn = torch.tensor(pred_class_ids)

    precisions = dict()
    recalls = dict()
    average_precisions = dict()

    # Go through each column, i.e. the 1/0 labels/preds
    # for one class at a time, and get the prec/rec numbers:
    for i in range(num_classes):
        bin_labels_arr = bin_labels_tn[:, i].tolist()
        preds_arr = preds_tn.tolist()

        # Get precision and recall at each
        # of the default thresholds:
        precs, recs = cls.compute_binary_pr_curve(bin_labels_arr,
                                                  preds_arr)
        precisions[i] = precs
        recalls[i] = recs

        # Average precision is:
        #
        #    AP = SUM_over_n((R_n - R_n-1) * P_n)
        #
        # i.e. the increase in recall times the current
        # precision as each pred/sample pair is processed:
        average_precisions[i] = average_precision_score(bin_labels_arr,
                                                        preds_arr,
                                                        average='macro',
                                                        )

    mAP = np.mean(list(average_precisions.values()))

    return (mAP, precisions, recalls, average_precisions)
def compute_multiclass_pr_curves(cls,
                                 truth_labels,
                                 raw_preds,
                                 thresholds=[0.2, 0.4, 0.6, 0.8]):
    '''
    Computes the data needed to draw a family of PR curves
    for the results of multiclass classifier output.

    Returns a dict of the constituent single-class curve
    specs, and a mean average precision (mAP) score for
    all curves combined.

    Each result dict maps a class ID to all info needed
    for one of the curves:

       1 : {'best_op_pt' : best_operating_pt,
            'precisions' : precisions,
            'recalls'    : recalls,
            'thresholds' : thresholds,
            'avg_prec'   : avg_precision
            }
       2 : {'best_op_pt' : best_operating_pt,
            'precisions' : precisions,
            'recalls'    : recalls,
            'thresholds' : thresholds,
            'avg_prec'   : avg_precision
            }

    where best_op_pt is:

           {'threshold'  : <optimal decision probability value>,
            'f1'         : <f1 at the optimal threshold>,
            'prec'       : <precision at the optimal threshold>,
            'thresholds' : thresholds,
            'rec'        : <recall at the optimal threshold>
            }

    Each avg_prec is the average of the precisions across
    the samples of one class (AP). I.e. there will be as
    many elements in average_precisions as there are classes.

    The Mean Average Precision (mAP) is the mean of the
    average_precision values. This measure summarizes the
    family of PR curves. It is comparable to AUC ROC.

    The precisions and recalls returned are dicts. The keys
    are class IDs, and the values are the precisions for
    that class. They are the quantities from which the
    average_precision values are computed.

    Summary:
        o precisions/recalls are the lowest granularity of
          information: the per-class precs and recs at
          different thresholds. There are as many entries
          in these dicts as there are classes. Each prec/rec
          value pair in the precisions and recalls dicts is
          the result of one threshold.

          TODO:
              o finish this sentence by running and seeing what's what
              o A unit test for this method
              o Finally: the actual drawing of the curves with pyplot

        o average_precision aggregates the precisions of one
          class across multiple thresholds. There will be as
          many entries in this dict as there are classes.

        o mAP aggregates the average_precision values across
          all classes. This is one number.

    :param truth_labels: all truth labels shaped
        torch.Size([num-batches, batch-size])
    :type truth_labels: Tensor
    :param raw_preds: the logits for each class for each sample
        as torch.Shape([num-batches, batch-size, num-classes])
    :type raw_preds: Tensor
    :return: (precisions, recalls, average_precisions, mAP)
    :rtype: ({int : [floats]}, {int : [floats]}, [floats], float)
    '''

    (num_batches, batch_size, num_classes) = raw_preds.shape
    num_samples = num_batches * batch_size

    # Will alternately treat each class prediction
    # as a one-vs-all binary classification.
    #
    # Ex. let labels = [1,0,0,1,2]
    #     and preds  = [0.3,0.6,0.1,0.7,0.9]
    #
    # Convert the labels to a one-hot vector; the width
    # of the binarized labels is num_classes:
    #
    #      L A B E L S                P R E D S
    #     ------------               ----------
    #     [1,        [[0, 1, 0],       [0.3,
    #      0,         [1, 0, 0],        0.6,
    #      0,   ==>   [1, 0, 0],        0.1,
    #      1,         [0, 1, 0],        0.7,
    #      2]         [0, 0, 1]]        0.9]
    #
    # Then evaluate each label column vector separately.
    bin_labels = label_binarize(truth_labels.flatten(),
                                classes=list(range(num_classes)))

    assert bin_labels.shape == torch.Size([num_samples, num_classes])
    assert raw_preds.shape == torch.Size([num_batches, batch_size, num_classes])

    # Want straight down: logits for each class, for
    # each sample ('lst' for 'list'):
    raw_preds_lst = raw_preds.reshape([num_samples, num_classes])
    assert raw_preds_lst.shape == bin_labels.shape

    # Turn logits into probs, row-wise:
    preds = torch.softmax(raw_preds_lst, dim=1)

    # Place to hold the result dicts from
    # compute_binary_pr_curve() for each of the classes.
    # This will be class-name : binary-result-dict
    all_curves_info = {}

    # Go through each column, i.e. the 1/0-vector label
    # column and preds column for one class at a time,
    # and get the prec/rec numbers:
    for col_idx in range(num_classes):
        bin_label_col = torch.tensor(bin_labels[:, col_idx])
        preds_col = preds[:, col_idx]

        # Get all info for this single, binary
        # classification: list of 1/0 labels, and
        # list of floats, which are the preds for
        # the current class:

        #**************
        # # Using sklearn's precision_recall_curve,
        # # which determines thresholds by its own algorithm:
        # from sklearn.metrics import precision_recall_curve
        # sklearn_precs, \
        # sklearn_recs, \
        # sklearn_thresholds = \
        #     precision_recall_curve(bin_label_col, preds_col)
        #**************

        # Obtain the information needed to draw one PR curve:
        # a CurveSpecification instance:
        one_class_curve = cls.compute_binary_pr_curve(bin_label_col,
                                                      preds_col,
                                                      col_idx,   # class_id
                                                      thresholds)

        # Accumulate the curve indices in a dict,
        # keyed by class ID:
        all_curves_info[col_idx] = one_class_curve

    avg_precs = [binary_curve_info['avg_prec']
                 for binary_curve_info in all_curves_info.values()]
    mAP = np.mean(np.array(avg_precs)).tolist()

    return (all_curves_info, mAP)
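# A hedged, self-contained sketch of the one-vs-rest pattern described in the
# comments above, using only public scikit-learn APIs (precision_recall_curve,
# average_precision_score) in place of the compute_binary_pr_curve helper.
import numpy as np
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.preprocessing import label_binarize

labels = np.array([1, 0, 0, 1, 2])
probs = np.array([[0.2, 0.7, 0.1],
                  [0.6, 0.3, 0.1],
                  [0.8, 0.1, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.2, 0.2, 0.6]])

num_classes = probs.shape[1]
bin_labels = label_binarize(labels, classes=list(range(num_classes)))

avg_precs = []
for col in range(num_classes):
    # One PR curve per class: column of 1/0 labels vs. that class's probability.
    prec, rec, thresh = precision_recall_curve(bin_labels[:, col], probs[:, col])
    avg_precs.append(average_precision_score(bin_labels[:, col], probs[:, col]))

mAP = float(np.mean(avg_precs))
print(mAP)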