    def test_f1_multi_class_macro(self):
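        """Macro-averaged F1 through the sequence-labeling wrapper.

        The wrapper is expected to read the sequence lengths stored at
        index 1 of batch['input'] and drop end-padded timesteps before
        forwarding the per-step predictions to the wrapped F1 evaluator.
        """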
        bi_sequence_len_idx = 1
        batch_input_key = 'input'
        model_output_key = None
        batch_target_key = 'target'
        end_padded = True
        wrapped_evaluator = evaluators.MultiClassF1Evaluator(
            model_output_key=model_output_key,
            batch_target_key=batch_target_key,
            average='macro')

        evaluator = evaluators.SequenceLabelingEvaluatorWrapper(
            evaluator=wrapped_evaluator,
            batch_input_sequence_length_idx=bi_sequence_len_idx,
            batch_input_key=batch_input_key,
            model_output_key=model_output_key,
            batch_target_key=batch_target_key,
            end_padded=end_padded)

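        # Two sequences of two timesteps over three classes. The second
        # sequence has length 1, so its last timestep (target -1, dummy
        # logits of -2) is padding that the wrapper should discard.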
        output = torch.tensor([[[0.5, 0.1, 0.4], [0.3, 0.3, 0.4]],
                               [[0.6, 0.4, 0.0], [-2., -2., -2.]]],
                              dtype=torch.float32)
        batch = {
            'target': torch.tensor([[0, 2], [2, -1]], dtype=torch.float32),
            'input': [None, torch.tensor([2, 1], dtype=torch.int)]
        }
        evaluator.step(output, batch)

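        # Second batch: a single one-step sequence.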
        output = torch.tensor([[[0.1, 0.1, 0.8]]], dtype=torch.float32)
        batch = {
            'target': torch.tensor([[2]], dtype=torch.float32),
            'input': [None, torch.tensor([1], dtype=torch.int)]
        }
        evaluator.step(output, batch)

        res = evaluator.calculate()

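        # Over the four unpadded timesteps the per-step argmaxes are
        # [0, 2, 0, 2] against gold labels [0, 2, 2, 2].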
        correct = metrics.f1_score(y_pred=np.array([0, 2, 0, 2]),
                                   y_true=np.array([0, 2, 2, 2]),
                                   average='macro')

        self.assertAlmostEqual(res.score, correct)

    def test_correct_score_calculation_micro(self):
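        """Micro-averaged F1 for the plain multi-class evaluator,
        accumulated across two step() calls."""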
        evaluator = evaluators.MultiClassF1Evaluator(model_output_key=None,
                                                     batch_target_key='target',
                                                     average='micro')

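        # Three samples over three classes. The expected predictions below
        # assume the evaluator takes the per-sample argmax and that the
        # [0.5, 0.5, 0.0] tie resolves to the lowest index, i.e. class 0.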
        output = torch.tensor(
            [[0.5, 0.1, 0.4], [0.3, 0.3, 0.4], [0.5, 0.5, 0.0]],
            dtype=torch.float32)
        batch = {'target': torch.tensor([0, 2, 2], dtype=torch.float32)}
        evaluator.step(output, batch)

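        # A second batch so the final score covers both step() calls.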
        output = torch.tensor([[0.1, 0.1, 0.8]], dtype=torch.float32)
        batch = {'target': torch.tensor([2], dtype=torch.float32)}
        evaluator.step(output, batch)

        res = evaluator.calculate()

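        # Expected micro-averaged F1 over all four accumulated predictions.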
        correct = metrics.f1_score(y_pred=np.array([0, 2, 0, 2]),
                                   y_true=np.array([0, 2, 2, 2]),
                                   average='micro')

        self.assertAlmostEqual(res.score, correct)