import bisect
import math
from collections import defaultdict
from unittest.mock import MagicMock, patch

import numpy as np
import pytest
import torch

# These classes are assumed to come from mmcls; adjust the import path if the
# package layout differs.
from mmcls.datasets import (BaseDataset, ClassBalancedDataset, ConcatDataset,
                            MultiLabelDataset, RepeatDataset)


# BaseDataset declares abstract methods, so patch them out to allow direct
# instantiation of the toy datasets below.
@patch.multiple(BaseDataset, __abstractmethods__=set())
def construct_toy_single_label_dataset(length):
    BaseDataset.CLASSES = ('foo', 'bar')
    BaseDataset.__getitem__ = MagicMock(side_effect=lambda idx: idx)
    dataset = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    cat_ids_list = [[np.random.randint(0, 80)] for _ in range(length)]
    dataset.data_infos = MagicMock()
    dataset.data_infos.__len__.return_value = length
    dataset.get_cat_ids = MagicMock(side_effect=lambda idx: cat_ids_list[idx])
    return dataset, cat_ids_list
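
# A minimal usage sketch (not part of the original tests) showing what the toy
# helper above yields: the mocked `__getitem__` simply echoes the index and
# `get_cat_ids` returns the pre-generated category ids.
def _demo_toy_single_label_dataset():
    toy_dataset, toy_cat_ids = construct_toy_single_label_dataset(4)
    assert len(toy_dataset) == 4  # length comes from the mocked data_infos
    assert toy_dataset[2] == 2  # __getitem__ echoes the index
    assert toy_dataset.get_cat_ids(2) == toy_cat_ids[2]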
@patch.multiple(MultiLabelDataset, __abstractmethods__=set())
@patch.multiple(BaseDataset, __abstractmethods__=set())
def test_dataset_evaluation():
    # test multi-class single-label evaluation
    dataset = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    dataset.data_infos = [
        dict(gt_label=0),
        dict(gt_label=0),
        dict(gt_label=1),
        dict(gt_label=2),
        dict(gt_label=1),
        dict(gt_label=0)
    ]
    fake_results = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1],
                             [0, 0, 1], [0, 0, 1]])
    eval_results = dataset.evaluate(
        fake_results, metric=['precision', 'recall', 'f1_score'])
    assert eval_results['precision'] == pytest.approx(
        (1 + 1 + 1 / 3) / 3 * 100.0)
    assert eval_results['recall'] == pytest.approx(
        (2 / 3 + 1 / 2 + 1) / 3 * 100.0)
    assert eval_results['f1_score'] == pytest.approx(
        (4 / 5 + 2 / 3 + 1 / 2) / 3 * 100.0)

    # test multi-label evaluation
    dataset = MultiLabelDataset(data_prefix='', pipeline=[], test_mode=True)
    dataset.data_infos = [
        dict(gt_label=[1, 1, 0, -1]),
        dict(gt_label=[1, 1, 0, -1]),
        dict(gt_label=[0, -1, 1, -1]),
        dict(gt_label=[0, 1, 0, -1]),
        dict(gt_label=[0, 1, 0, -1]),
    ]
    fake_results = np.array([[0.9, 0.8, 0.3, 0.2], [0.1, 0.2, 0.2, 0.1],
                             [0.7, 0.5, 0.9, 0.3], [0.8, 0.1, 0.1, 0.2],
                             [0.8, 0.1, 0.1, 0.2]])

    # the metric must be valid
    with pytest.raises(KeyError):
        metric = 'coverage'
        dataset.evaluate(fake_results, metric=metric)
    # only one metric
    metric = 'mAP'
    eval_results = dataset.evaluate(fake_results, metric=metric)
    assert 'mAP' in eval_results.keys()
    assert 'CP' not in eval_results.keys()

    # multiple metrics
    metric = ['mAP', 'CR', 'OF1']
    eval_results = dataset.evaluate(fake_results, metric=metric)
    assert 'mAP' in eval_results.keys()
    assert 'CR' in eval_results.keys()
    assert 'OF1' in eval_results.keys()
    assert 'CF1' not in eval_results.keys()
@patch.multiple(BaseDataset, __abstractmethods__=set())
def construct_toy_multi_label_dataset(length):
    BaseDataset.CLASSES = ('foo', 'bar')
    BaseDataset.__getitem__ = MagicMock(side_effect=lambda idx: idx)
    dataset = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    cat_ids_list = [
        np.random.randint(0, 80, num).tolist()
        for num in np.random.randint(1, 20, length)
    ]
    dataset.data_infos = MagicMock()
    dataset.data_infos.__len__.return_value = length
    dataset.get_cat_ids = MagicMock(side_effect=lambda idx: cat_ids_list[idx])
    # `mock_evaluate` is assumed to be a stub defined elsewhere in this module.
    dataset.evaluate = MagicMock(side_effect=mock_evaluate)
    return dataset, cat_ids_list
@patch.multiple(BaseDataset, __abstractmethods__=set())
def test_dataset_evaluation():
    dataset = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    dataset.data_infos = [
        dict(gt_label=0),
        dict(gt_label=1),
        dict(gt_label=2),
        dict(gt_label=1)
    ]
    fake_results = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]])
    eval_results = dataset.evaluate(
        fake_results, metric=['precision', 'recall', 'f1_score'])
    assert eval_results['precision'] == pytest.approx(
        (1 + 1 + 1 / 2) / 3 * 100.0)
    assert eval_results['recall'] == pytest.approx((1 + 1 / 2 + 1) / 3 * 100.0)
    assert eval_results['f1_score'] == pytest.approx(
        (1 + 2 / 3 + 2 / 3) / 3 * 100.0)
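
# Illustrative sketch (not part of the original tests) of where the expected
# macro-averaged numbers in the test above come from: per-class
# precision/recall/F1 are computed from the argmax predictions and then
# averaged over the three classes, scaled to percentages.
def _demo_macro_metrics():
    y_true = np.array([0, 1, 2, 1])
    y_pred = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]).argmax(1)

    precisions, recalls, f1_scores = [], [], []
    for cls in range(3):
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        p = tp / (tp + fp)
        r = tp / (tp + fn)
        precisions.append(p)
        recalls.append(r)
        f1_scores.append(2 * p * r / (p + r))

    assert np.mean(precisions) * 100 == pytest.approx(
        (1 + 1 + 1 / 2) / 3 * 100.0)
    assert np.mean(recalls) * 100 == pytest.approx((1 + 1 / 2 + 1) / 3 * 100.0)
    assert np.mean(f1_scores) * 100 == pytest.approx(
        (1 + 2 / 3 + 2 / 3) / 3 * 100.0)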
@patch.multiple(BaseDataset, __abstractmethods__=set())
def test_dataset_wrapper():
    BaseDataset.CLASSES = ('foo', 'bar')
    BaseDataset.__getitem__ = MagicMock(side_effect=lambda idx: idx)
    dataset_a = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    len_a = 10
    cat_ids_list_a = [
        np.random.randint(0, 80, num).tolist()
        for num in np.random.randint(1, 20, len_a)
    ]
    dataset_a.data_infos = MagicMock()
    dataset_a.data_infos.__len__.return_value = len_a
    dataset_a.get_cat_ids = MagicMock(
        side_effect=lambda idx: cat_ids_list_a[idx])
    dataset_b = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    len_b = 20
    cat_ids_list_b = [
        np.random.randint(0, 80, num).tolist()
        for num in np.random.randint(1, 20, len_b)
    ]
    dataset_b.data_infos = MagicMock()
    dataset_b.data_infos.__len__.return_value = len_b
    dataset_b.get_cat_ids = MagicMock(
        side_effect=lambda idx: cat_ids_list_b[idx])

    # ConcatDataset chains dataset_a (10 items) and dataset_b (20 items)
    concat_dataset = ConcatDataset([dataset_a, dataset_b])
    assert concat_dataset[5] == 5
    assert concat_dataset[25] == 15
    assert concat_dataset.get_cat_ids(5) == cat_ids_list_a[5]
    assert concat_dataset.get_cat_ids(25) == cat_ids_list_b[15]
    assert len(concat_dataset) == len(dataset_a) + len(dataset_b)
    assert concat_dataset.CLASSES == BaseDataset.CLASSES

    # RepeatDataset cycles through dataset_a ten times
    repeat_dataset = RepeatDataset(dataset_a, 10)
    assert repeat_dataset[5] == 5
    assert repeat_dataset[15] == 5
    assert repeat_dataset[27] == 7
    assert repeat_dataset.get_cat_ids(5) == cat_ids_list_a[5]
    assert repeat_dataset.get_cat_ids(15) == cat_ids_list_a[5]
    assert repeat_dataset.get_cat_ids(27) == cat_ids_list_a[7]
    assert len(repeat_dataset) == 10 * len(dataset_a)
    assert repeat_dataset.CLASSES == BaseDataset.CLASSES

    # recompute the expected repeat factors for ClassBalancedDataset
    category_freq = defaultdict(int)
    for cat_ids in cat_ids_list_a:
        cat_ids = set(cat_ids)
        for cat_id in cat_ids:
            category_freq[cat_id] += 1
    for k, v in category_freq.items():
        category_freq[k] = v / len(cat_ids_list_a)
    mean_freq = np.mean(list(category_freq.values()))
    repeat_thr = mean_freq
    category_repeat = {
        cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
        for cat_id, cat_freq in category_freq.items()
    }
    repeat_factors = []
    for cat_ids in cat_ids_list_a:
        cat_ids = set(cat_ids)
        repeat_factor = max({category_repeat[cat_id] for cat_id in cat_ids})
        repeat_factors.append(math.ceil(repeat_factor))
    repeat_factors_cumsum = np.cumsum(repeat_factors)
    repeat_factor_dataset = ClassBalancedDataset(dataset_a, repeat_thr)
    assert repeat_factor_dataset.CLASSES == BaseDataset.CLASSES
    assert len(repeat_factor_dataset) == repeat_factors_cumsum[-1]
    for idx in np.random.randint(0, len(repeat_factor_dataset), 3):
        assert repeat_factor_dataset[idx] == bisect.bisect_right(
            repeat_factors_cumsum, idx)
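
# A small worked example (illustrative only, with hand-picked numbers) of the
# repeat-factor rule checked in `test_dataset_wrapper` above:
#   f(c) = fraction of images containing category c
#   r(c) = max(1, sqrt(repeat_thr / f(c)))
#   r(I) = max over categories c in image I of r(c), then ceil(r(I)) repeats.
def _demo_repeat_factors():
    toy_cat_ids = [[0], [0, 1], [1], [2]]  # 4 images, 3 categories
    repeat_thr = 0.5

    # category frequencies: 0 and 1 appear in 2/4 images, 2 in only 1/4
    category_freq = {0: 2 / 4, 1: 2 / 4, 2: 1 / 4}
    category_repeat = {
        cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
        for cat_id, cat_freq in category_freq.items()
    }
    repeat_factors = [
        math.ceil(max(category_repeat[cat_id] for cat_id in cat_ids))
        for cat_ids in toy_cat_ids
    ]
    # only the image holding the rare category 2 is oversampled (sqrt(2) -> 2)
    assert repeat_factors == [1, 1, 1, 2]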
@patch.multiple(MultiLabelDataset, __abstractmethods__=set())
@patch.multiple(BaseDataset, __abstractmethods__=set())
def test_dataset_evaluation():
    # test multi-class single-label evaluation
    dataset = BaseDataset(data_prefix='', pipeline=[], test_mode=True)
    dataset.data_infos = [
        dict(gt_label=0),
        dict(gt_label=0),
        dict(gt_label=1),
        dict(gt_label=2),
        dict(gt_label=1),
        dict(gt_label=0)
    ]
    fake_results = np.array([[0.7, 0, 0.3], [0.5, 0.2, 0.3], [0.4, 0.5, 0.1],
                             [0, 0, 1], [0, 0, 1], [0, 0, 1]])
    eval_results = dataset.evaluate(
        fake_results,
        metric=['precision', 'recall', 'f1_score', 'support', 'accuracy'],
        metric_options={'topk': 1})
    assert eval_results['precision'] == pytest.approx(
        (1 + 1 + 1 / 3) / 3 * 100.0)
    assert eval_results['recall'] == pytest.approx(
        (2 / 3 + 1 / 2 + 1) / 3 * 100.0)
    assert eval_results['f1_score'] == pytest.approx(
        (4 / 5 + 2 / 3 + 1 / 2) / 3 * 100.0)
    assert eval_results['support'] == 6
    assert eval_results['accuracy'] == pytest.approx(4 / 6 * 100)

    # test input as tensor
    fake_results_tensor = torch.from_numpy(fake_results)
    eval_results_ = dataset.evaluate(
        fake_results_tensor,
        metric=['precision', 'recall', 'f1_score', 'support', 'accuracy'],
        metric_options={'topk': 1})
    assert eval_results_ == eval_results

    # test thr
    eval_results = dataset.evaluate(
        fake_results,
        metric=['precision', 'recall', 'f1_score', 'accuracy'],
        metric_options={
            'thrs': 0.6,
            'topk': 1
        })
    assert eval_results['precision'] == pytest.approx(
        (1 + 0 + 1 / 3) / 3 * 100.0)
    assert eval_results['recall'] == pytest.approx((1 / 3 + 0 + 1) / 3 * 100.0)
    assert eval_results['f1_score'] == pytest.approx(
        (1 / 2 + 0 + 1 / 2) / 3 * 100.0)
    assert eval_results['accuracy'] == pytest.approx(2 / 6 * 100)

    # thrs must be a float, tuple or None
    with pytest.raises(TypeError):
        eval_results = dataset.evaluate(
            fake_results,
            metric=['precision', 'recall', 'f1_score', 'accuracy'],
            metric_options={
                'thrs': 'thr',
                'topk': 1
            })

    # test topk and thr as tuple
    eval_results = dataset.evaluate(
        fake_results,
        metric=['precision', 'recall', 'f1_score', 'accuracy'],
        metric_options={
            'thrs': (0.5, 0.6),
            'topk': (1, 2)
        })
    assert {
        'precision_thr_0.50', 'precision_thr_0.60', 'recall_thr_0.50',
        'recall_thr_0.60', 'f1_score_thr_0.50', 'f1_score_thr_0.60',
        'accuracy_top-1_thr_0.50', 'accuracy_top-1_thr_0.60',
        'accuracy_top-2_thr_0.50', 'accuracy_top-2_thr_0.60'
    } == eval_results.keys()
    assert type(eval_results['precision_thr_0.50']) == float
    assert type(eval_results['recall_thr_0.50']) == float
    assert type(eval_results['f1_score_thr_0.50']) == float
    assert type(eval_results['accuracy_top-1_thr_0.50']) == float

    eval_results = dataset.evaluate(
        fake_results,
        metric='accuracy',
        metric_options={
            'thrs': 0.5,
            'topk': (1, 2)
        })
    assert {'accuracy_top-1', 'accuracy_top-2'} == eval_results.keys()
    assert type(eval_results['accuracy_top-1']) == float

    eval_results = dataset.evaluate(
        fake_results,
        metric='accuracy',
        metric_options={
            'thrs': (0.5, 0.6),
            'topk': 1
        })
    assert {'accuracy_thr_0.50', 'accuracy_thr_0.60'} == eval_results.keys()
    assert type(eval_results['accuracy_thr_0.50']) == float

    # test evaluation results for classes
    eval_results = dataset.evaluate(
        fake_results,
        metric=['precision', 'recall', 'f1_score', 'support'],
        metric_options={'average_mode': 'none'})
    assert eval_results['precision'].shape == (3, )
    assert eval_results['recall'].shape == (3, )
    assert eval_results['f1_score'].shape == (3, )
    assert eval_results['support'].shape == (3, )

    # the average_mode method must be valid
    with pytest.raises(ValueError):
        eval_results = dataset.evaluate(
            fake_results,
            metric='precision',
            metric_options={'average_mode': 'micro'})
    with pytest.raises(ValueError):
        eval_results = dataset.evaluate(
            fake_results,
            metric='recall',
            metric_options={'average_mode': 'micro'})
    with pytest.raises(ValueError):
        eval_results = dataset.evaluate(
            fake_results,
            metric='f1_score',
            metric_options={'average_mode': 'micro'})
    with pytest.raises(ValueError):
        eval_results = dataset.evaluate(
            fake_results,
            metric='support',
            metric_options={'average_mode': 'micro'})

    # the metric must be valid for the dataset
    with pytest.raises(ValueError):
        eval_results = dataset.evaluate(fake_results, metric='map')

    # test multi-label evaluation
    dataset = MultiLabelDataset(data_prefix='', pipeline=[], test_mode=True)
    dataset.data_infos = [
        dict(gt_label=[1, 1, 0, -1]),
        dict(gt_label=[1, 1, 0, -1]),
        dict(gt_label=[0, -1, 1, -1]),
        dict(gt_label=[0, 1, 0, -1]),
        dict(gt_label=[0, 1, 0, -1]),
    ]
    fake_results = np.array([[0.9, 0.8, 0.3, 0.2], [0.1, 0.2, 0.2, 0.1],
                             [0.7, 0.5, 0.9, 0.3], [0.8, 0.1, 0.1, 0.2],
                             [0.8, 0.1, 0.1, 0.2]])

    # the metric must be valid
    with pytest.raises(ValueError):
        metric = 'coverage'
        dataset.evaluate(fake_results, metric=metric)
    # only one metric
    metric = 'mAP'
    eval_results = dataset.evaluate(fake_results, metric=metric)
    assert 'mAP' in eval_results.keys()
    assert 'CP' not in eval_results.keys()

    # multiple metrics
    metric = ['mAP', 'CR', 'OF1']
    eval_results = dataset.evaluate(fake_results, metric=metric)
    assert 'mAP' in eval_results.keys()
    assert 'CR' in eval_results.keys()
    assert 'OF1' in eval_results.keys()
    assert 'CF1' not in eval_results.keys()
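
# Rough sketch (illustrative only) of how the "C" and "O" metrics asserted
# above differ: CP/CR/CF1 average per-class precision/recall/F1, while
# OP/OR/OF1 pool true positives over all classes. The toy thresholded
# prediction matrix below is hand-picked and does not reproduce mmcls's exact
# implementation (which also handles -1 "difficult" labels and thresholding).
def _demo_class_vs_overall_metrics():
    pred = np.array([[1, 0], [1, 1], [0, 1]])  # binarized predictions
    gt = np.array([[1, 0], [0, 1], [0, 1]])  # binary ground truth

    tp = np.sum((pred == 1) & (gt == 1), axis=0)  # per-class true positives
    num_pred = np.sum(pred == 1, axis=0)
    num_gt = np.sum(gt == 1, axis=0)

    CP = np.mean(tp / num_pred) * 100  # class-wise (macro) precision
    CR = np.mean(tp / num_gt) * 100  # class-wise (macro) recall
    OP = tp.sum() / num_pred.sum() * 100  # overall (micro) precision
    OR = tp.sum() / num_gt.sum() * 100  # overall (micro) recall

    assert CP == 75.0 and OP == 75.0
    assert CR == 100.0 and OR == 100.0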