import pytest
import torch


def test_v1_4_0_deprecated_metrics():
    from pytorch_lightning.metrics.functional.classification import stat_scores_multiple_classes
    with pytest.deprecated_call(match='will be removed in v1.4'):
        stat_scores_multiple_classes(pred=torch.tensor([0, 1]), target=torch.tensor([0, 1]))

    from pytorch_lightning.metrics.functional.classification import iou
    with pytest.deprecated_call(match='will be removed in v1.4'):
        iou(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))

    from pytorch_lightning.metrics.functional.classification import recall
    with pytest.deprecated_call(match='will be removed in v1.4'):
        recall(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))

    from pytorch_lightning.metrics.functional.classification import precision
    with pytest.deprecated_call(match='will be removed in v1.4'):
        precision(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))

    from pytorch_lightning.metrics.functional.classification import precision_recall
    with pytest.deprecated_call(match='will be removed in v1.4'):
        precision_recall(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))

    # Testing deprecation of class_reduction arg in the *new* precision
    from pytorch_lightning.metrics.functional import precision
    with pytest.deprecated_call(match='will be removed in v1.4'):
        precision(torch.randint(0, 2, (10,)), torch.randint(0, 2, (10,)), class_reduction='micro')

    # Testing deprecation of class_reduction arg in the *new* recall
    from pytorch_lightning.metrics.functional import recall
    with pytest.deprecated_call(match='will be removed in v1.4'):
        recall(torch.randint(0, 2, (10,)), torch.randint(0, 2, (10,)), class_reduction='micro')

    from pytorch_lightning.metrics.functional.classification import auc
    with pytest.deprecated_call(match='will be removed in v1.4'):
        auc(torch.rand(10,).sort().values, torch.rand(10,))

    from pytorch_lightning.metrics.functional.classification import auroc
    with pytest.deprecated_call(match='will be removed in v1.4'):
        auroc(torch.rand(10,), torch.randint(0, 2, (10,)))

    from pytorch_lightning.metrics.functional.classification import multiclass_auroc
    with pytest.deprecated_call(match='will be removed in v1.4'):
        multiclass_auroc(torch.rand(20, 5).softmax(dim=-1), torch.randint(0, 5, (20,)), num_classes=5)

    from pytorch_lightning.metrics.functional.classification import auc_decorator
    with pytest.deprecated_call(match='will be removed in v1.4'):
        auc_decorator()

    from pytorch_lightning.metrics.functional.classification import multiclass_auc_decorator
    with pytest.deprecated_call(match='will be removed in v1.4'):
        multiclass_auc_decorator()
def validation_epoch_end(self, outputs):
    """
    After going through the entire validation set, compute the final ROC curve
    accumulated over all predictions and target masks, then compute the AUROC.
    """
    if self.hparams.auroc:
        fpr, tpr, thresholds = self.roc.compute()
        # Sort by ascending FPR so the trapezoidal AUC integration is valid
        fpr, idx = torch.sort(fpr, descending=False)
        tpr, thresholds = tpr[idx], thresholds[idx]
        auroc = auc(fpr, tpr)
        self.log('auroc', auroc)
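# A minimal sketch (not the original module) of how `self.roc` is presumably
# wired so that `validation_epoch_end` above has accumulated state to
# `compute()`. The metric class, sigmoid activation, and (image, mask) batch
# layout are assumptions; PL 1.x shipped a class-based ROC metric whose
# `update()` accumulates predictions and targets across validation batches.
from pytorch_lightning.metrics import ROC  # assumed PL 1.x import

def __init__(self, hparams):
    super().__init__()
    self.save_hyperparameters(hparams)
    self.roc = ROC()  # accumulates (preds, targets) every validation step

def validation_step(self, batch, batch_idx):
    x, y = batch  # assumed (image, mask) layout
    preds = torch.sigmoid(self(x))
    if self.hparams.auroc:
        self.roc.update(preds.flatten(), y.flatten().int())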
def test_reorder_remove_in_v1_1():
    with pytest.deprecated_call(match='The `reorder` parameter to `auc` has been deprecated'):
        _ = auc(torch.tensor([0, 1, 2, 3]), torch.tensor([0, 1, 2, 2]), reorder=True)
def test_epoch_end(self, outputs):
    if self.incorrect_type != 'boundary':
        ##### Confusion Matrix #####
        conf_mtx = confusion_matrix(
            torch.cat([b['preds'] for b in outputs]),
            torch.cat([b['labels'] for b in outputs]),
            normalize=False,
            num_classes=5)

        ##### Normalized Confusion Matrix #####
        conf_mtx_normalized = confusion_matrix(
            torch.cat([b['preds'] for b in outputs]),
            torch.cat([b['labels'] for b in outputs]),
            normalize=True,
            num_classes=5)

        ##### Weighted Confusion Matrix #####
        conf_mtx_weighted = conf_mtx.clone()
        for c, w in enumerate(self.weights):
            conf_mtx_weighted[c, :] *= w

        ##### ACCURACY #####
        accuracy = torch.diag(conf_mtx).sum() / conf_mtx.sum()
        accuracy_weighted = torch.diag(conf_mtx_weighted).sum() / conf_mtx_weighted.sum()

        ##### AUC_SCORE #####
        roc_results = multiclass_roc(
            torch.cat([b['logits'] for b in outputs]),
            torch.cat([b['labels'] for b in outputs]),
            num_classes=5)
        AUROC_str = ''
        AUROC_list = {}
        for cls, roc_cls in enumerate(roc_results):
            fpr, tpr, threshold = roc_cls
            cls_auc = auc(fpr, tpr)  # compute once, reuse for logging and reporting
            self.logger.experiment.add_scalar(f'val_AUC[{cls}]', cls_auc, self.current_epoch)
            AUROC_str += '\tAUC_SCORE[CLS %d]: \t%.4f\n' % (cls, cls_auc)
            AUROC_list['AUC_SCORE[CLS %d]' % cls] = cls_auc

        ##### F1 #####
        f1_score = f1(
            torch.cat([b['preds'] for b in outputs]),
            torch.cat([b['labels'] for b in outputs]),
            num_classes=5)

        ##### Average Precision #####
        # TO DO

        ##### PRINT RESULTS #####
        print('=' * 100)
        print(f'[MODEL NAME]: {self.model_name} \t [INCORRECT TYPE]: {self.incorrect_type}')
        print('RESULTS:')
        print('\tAccuracy: \t\t%.4f' % accuracy)
        print('\tWeighted Accuracy: \t%.4f' % accuracy_weighted)
        print('\tF1 Score: \t\t%.4f' % f1_score)
        print(AUROC_str)

        self.metrics_result[self.incorrect_type][self.model_name] = {
            'Accuracy': round(float(accuracy), 4),
            'Weighted Accuracy': round(float(accuracy_weighted), 4),
            'F1_score': round(float(f1_score), 4),
        }
        for key, val in AUROC_list.items():
            self.metrics_result[self.incorrect_type][self.model_name].update({key: round(float(val), 4)})

        print('Confusion Matrix')
        fig, ax = plt.subplots(figsize=(4, 4))
        sn.heatmap(conf_mtx.cpu(), annot=True, cbar=False, annot_kws={"size": 15}, fmt='g', cmap='mako')
        plt.show()
        fig, ax = plt.subplots(figsize=(4, 4))
        sn.heatmap(conf_mtx_normalized.cpu(), annot=True, cbar=False, annot_kws={"size": 12}, fmt='.2f', cmap='mako')
        plt.show()
        print('=' * 100)
    else:
        tol_correct = 0
        tol_samples = 0
        tol_drop = 0
        for batch in outputs:
            preds = batch['preds']
            labels = batch['labels']
            slope_id = batch['doc_ids']
            ##### Change lizhong's code #####
            for idx, slop_idx in enumerate(slope_id):
                agree_by_user = bool(
                    slope_df[slope_df['slope_id'] == slop_idx.item()]['sentiment_correct'].values[0])
                possible_classes = slope_df[
                    slope_df['slope_id'] == slop_idx.item()]['label_from_score'].values[0]
                pred_class = preds[idx]
                # Difference between predicted and true label
                diff = torch.abs(pred_class - possible_classes)
                if agree_by_user:  # the gold label is trusted
                    if diff == 0:  # correct prediction
                        tol_correct += 1
                        tol_samples += 1
                    elif diff == 1:  # too close to the boundary: discard
                        tol_drop += 1
                    else:  # wrong prediction
                        tol_samples += 1
                else:  # the gold label is marked incorrect
                    if diff == 0:  # wrong prediction
                        tol_samples += 1
                    elif diff == 1:  # too close to the boundary: discard
                        tol_drop += 1
                    else:  # correct prediction
                        tol_correct += 1
                        tol_samples += 1

        boundary_accuracy = round(tol_correct / tol_samples, 4)
        self.metrics_result[self.incorrect_type][self.model_name] = {}
        self.metrics_result[self.incorrect_type][self.model_name]['boundary_acc'] = boundary_accuracy
        self.metrics_result[self.incorrect_type][self.model_name]['total_drop_sample'] = tol_drop
        print('=' * 100)
        print(f'[MODEL NAME]: {self.model_name} \t [INCORRECT TYPE]: {self.incorrect_type}')
        print('\tBoundary Accuracy: \t\t%.4f' % boundary_accuracy)
        print('\tDrop Total Sample: \t\t%d' % tol_drop)
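# Hedged companion sketch: `test_epoch_end` above consumes batch dicts with
# the keys 'preds', 'labels', 'logits', and 'doc_ids', so the matching
# `test_step` presumably returns something like this. The model call and
# batch layout are assumptions, not the original code.
def test_step(self, batch, batch_idx):
    input_ids, labels, doc_ids = batch  # assumed batch layout
    logits = self(input_ids)
    return {
        'preds': logits.argmax(dim=-1),  # hard class predictions
        'labels': labels,
        'logits': logits,                # kept for multiclass_roc
        'doc_ids': doc_ids,              # joined against slope_df above
    }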
def test_auc(x, y, expected):
    # Test Area Under Curve (AUC) computation
    assert auc(torch.tensor(x), torch.tensor(y)) == expected
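# The bare `test_auc` above is presumably driven by a parametrize decorator;
# a hedged example with illustrative cases follows (expected values come from
# the trapezoidal rule and are exactly representable, so `==` is safe).
@pytest.mark.parametrize(['x', 'y', 'expected'], [
    pytest.param([0, 1], [0, 1], 0.5, id='triangle'),
    pytest.param([0, 1], [1, 1], 1.0, id='unit square'),
    pytest.param([0, 0.5, 1], [0, 0.5, 1], 0.5, id='two segments'),
])
def test_auc_cases(x, y, expected):
    assert auc(torch.tensor(x), torch.tensor(y)) == expected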
def compute(self) -> torch.Tensor:
    preds, targets = self._get_preds_and_targets()
    # AUPRC is undefined when only one class is present in the targets
    if torch.unique(targets).numel() == 1:
        return torch.tensor(np.nan)
    prec, recall, _ = precision_recall_curve(preds, targets)
    return auc(recall, prec)
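# A quick sanity check of the same computation in functional form, assuming
# the PL 1.x functional `precision_recall_curve` and `auc` used by `compute`
# above; the random inputs are illustrative only.
import torch
from pytorch_lightning.metrics.functional import auc, precision_recall_curve

preds = torch.rand(10)
targets = torch.randint(0, 2, (10,))
prec, recall, _ = precision_recall_curve(preds, targets)
auprc = auc(recall, prec)  # area under the precision-recall curve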
def test_epoch_end(self, outputs):
    """
    After going through the entire test set, compute the final ROC curve
    accumulated over all predictions and target masks, then compute the AUROC.
    """
    if self.hparams.auroc:
        # Compute ROC, then compute AUROC and log the value for the whole test set
        fpr, tpr, thresholds = self.roc.compute()
        fpr, idx = torch.sort(fpr, descending=False)
        tpr, thresholds = tpr[idx], thresholds[idx]
        auroc = auc(fpr, tpr)
        self.log('auroc_test', auroc)

        # Subsample the ROC thresholds down to ~100 equally spaced values
        # (guard against a zero step when there are fewer than 100 thresholds)
        step_size = max(1, int(len(thresholds) / 100))
        thresholds = thresholds[::step_size]

        # Find the best threshold based on the best IOU
        best_iou = 0
        best_threshold = -1

        # For each threshold, compute the IOU over the whole test set
        test_dataloader = self.trainer.datamodule.test_dataloader()[1]
        for i, threshold in enumerate(thresholds):
            ious = []
            for batch_idx, (x, y) in enumerate(test_dataloader):
                x, y = x.to(self.device), y.to(self.device)
                x_rec, M, colormaps = self.forward(x)
                bloc_map = self.gen_bloc_map(M, threshold)
                iou_score = iou(bloc_map, y)
                ious.append(iou_score.detach().cpu().item())
            avg_iou = np.mean(ious)
            if avg_iou > best_iou:
                best_iou = avg_iou
                best_threshold = threshold
            self.trainer.logger.experiment.add_scalar('avg_iou', avg_iou, i)
            self.trainer.logger.experiment.add_scalar('threshold', threshold, i)

        # Log best IOU and threshold
        self.log('best_iou', best_iou)
        self.log('best_threshold', best_threshold)

        # Now, using the best threshold, generate the binary localization maps
        # for all images in the test set and log/save them
        for batch_idx, (x, y) in enumerate(test_dataloader):
            x, y = x.to(self.device), y.to(self.device)
            x_rec, M, colormaps = self.forward(x)
            bloc_map = self.gen_bloc_map(M, best_threshold)

            # Save the binary localization maps
            bloc_map = bloc_map.detach().cpu()
            bloc_map_grid = make_grid(bloc_map).float()
            save_image(bloc_map_grid, f'{self.trainer.logger.log_dir}/batch{batch_idx}-blocmaps.png')
            self.trainer.logger.experiment.add_image('blocmaps', bloc_map_grid.numpy(), batch_idx)

            # Save the input images
            x = x.detach().cpu()
            x = self.trainer.datamodule.unnormalize_batch(x)
            x_grid = make_grid(x).float()
            save_image(x_grid, f'{self.trainer.logger.log_dir}/batch{batch_idx}-input.png')
            self.trainer.logger.experiment.add_image('input', x_grid.numpy(), batch_idx)

            # Save the target masks
            y = y.detach().cpu()
            y_grid = make_grid(y).float()
            save_image(y_grid, f'{self.trainer.logger.log_dir}/batch{batch_idx}-targets.png')
            self.trainer.logger.experiment.add_image('targets', y_grid.numpy(), batch_idx)
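# Hypothetical sketch of the `gen_bloc_map` helper called above: binarize the
# map M at the given threshold to get a binary localization map. The real
# implementation may also resize or reduce channels before thresholding.
def gen_bloc_map(self, M, threshold):
    return (M >= threshold).long()  # 1 where the map exceeds the threshold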