Example #1
    def __call__(self, inputs, outputs):
        words, label_gts, label_preds = normalized_words_labels_preds(
            inputs, outputs, self.tokenizer)
        # seqeval metrics expect (y_true, y_pred): pass the gold labels first.
        acc = accuracy_score(label_gts, label_preds)
        f1 = f1_score(label_gts, label_preds)
        precision = precision_score(label_gts, label_preds)
        recall = recall_score(label_gts, label_preds)

        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
Example #2
def acc_and_f1(preds, labels):
    acc = accuracy_score(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    p = precision_score(y_true=labels, y_pred=preds)
    r = recall_score(y_true=labels, y_pred=preds)
    report = classification_report(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
        "precision:": p,
        "recall": r,
        # "report": report,
    }
Example #3
 def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
     """Reduces aggregated logs over validation steps."""
     label_class = aggregated_logs['label_class']
     predict_class = aggregated_logs['predict_class']
     return {
         'f1':
         seqeval_metrics.f1_score(label_class, predict_class),
         'precision':
         seqeval_metrics.precision_score(label_class, predict_class),
         'recall':
         seqeval_metrics.recall_score(label_class, predict_class),
         'accuracy':
         seqeval_metrics.accuracy_score(label_class, predict_class),
     }
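This example calls the metrics through a seqeval_metrics alias; the import below is an assumption (the snippet does not show its imports), paired with a tiny invented input:

# Assumed import behind the seqeval_metrics alias used above (not shown in the snippet).
from seqeval import metrics as seqeval_metrics

# Invented toy input: one sentence, perfectly predicted.
label_class = [['B-PER', 'I-PER', 'O']]
predict_class = [['B-PER', 'I-PER', 'O']]
print(seqeval_metrics.f1_score(label_class, predict_class))  # 1.0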
Example #4
def results_as_entities(out_label_list, preds_list, results_abs_path):
    results = {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
        "accuracy": accuracy_score(out_label_list, preds_list),
        "report:": classification_report(out_label_list, preds_list, digits=2)
    }

    with open(results_abs_path, "w") as writer:
        for key in sorted(results.keys()):
            writer.write("{} = {}\n".format(key, str(results[key])))

    return results
Example #5
    def infer(self):
        num_test_batch = int(self.test_size / self.batch_size) + 1

        test_feed_dict = {
            self.char_input: self.test_char_idx,
            self.seq_len: self.test_len,
            self.label: self.test_Y
        }

        modelpath = f"./ner-model-{self.test_type}/"
        modelName = f"ner-{self.test_type}.ckpt"
        best_f1_score = 0.
        saver = tf.train.Saver()

        with tf.Session(config=config) as sess:

            sess.run(tf.global_variables_initializer())
            ## Load pretrained model
            ckpt = tf.train.get_checkpoint_state(modelpath)
            if (ckpt and tf.train.checkpoint_exists(
                    ckpt.model_checkpoint_path)):
                saver.restore(sess, modelpath + modelName)
                print("Model loaded!")

            start_time = datetime.datetime.now()
            test_f1_list, test_recall_list, test_precision_list = [], [], []

            print(self.data.label2idx)

            sess.run(self.test_init_op, feed_dict=test_feed_dict)
            y_true_list, y_pred_list, test_loss = [], [], 0.
            for step in range(num_test_batch):
                loss = sess.run(self.loss, feed_dict={self.dropout: 0.0})
                y_true, y_pred = self.predict(sess)
                y_true_list += y_true
                y_pred_list += y_pred
                test_loss += loss / num_test_batch

            test_f1_score = f1_score(y_true_list, y_pred_list)
            test_recall_score = recall_score(y_true_list, y_pred_list)
            test_precision_score = precision_score(y_true_list, y_pred_list)

            print(
                "[test] loss {:.6f} f1 {:.4f} recall {:.4f} precision {:.4f}".
                format(test_loss, test_f1_score, test_recall_score,
                       test_precision_score))
            print(classification_report(y_true_list, y_pred_list, digits=4))

            elapsed_time = datetime.datetime.now() - start_time
            print("{}".format(elapsed_time))
Example #6
def eval_seq_scores(y_true, y_pred):
    """ Performs sequence evaluation on slot labels
        Args:
            y_true: true slot labels
            y_pred: predicted slot labels
        Returns:
            scores: dict containing the evaluation scores: f1, accuracy, precision, recall
    """
    scores = dict()
    scores['f1'] = f1_score(y_true, y_pred)
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['precision'] = precision_score(y_true, y_pred)
    scores['recall'] = recall_score(y_true, y_pred)
    return scores
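seqeval scores at the entity level and, for functions like eval_seq_scores above, each argument is expected to be a list of per-sentence tag sequences. A minimal self-contained sketch with invented BIO tags:

from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# One list of BIO tags per sentence; spans are compared as whole entities.
y_true = [['B-LOC', 'I-LOC', 'O'], ['B-PER', 'O']]
y_pred = [['B-LOC', 'I-LOC', 'O'], ['O', 'O']]

print(precision_score(y_true, y_pred))  # 1.0   the single predicted span is correct
print(recall_score(y_true, y_pred))     # 0.5   one of two gold spans was found
print(f1_score(y_true, y_pred))         # ~0.667
print(accuracy_score(y_true, y_pred))   # 0.8   token-level accuracy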
Example #7
 def metrics_fn(logits, labels):
     preds = np.argmax(logits, axis=-1)
     label_names = self.get_labels()
     y_true = []
     y_pred = []
     for pred, label in zip(preds, labels):
         y_true.append([label_names[l] for l in label if l >= 0])
         y_pred.append(
             [label_names[p] for p, l in zip(pred, label) if l >= 0])
     return OrderedDict(
         accuracy=seq_metrics.accuracy_score(y_true, y_pred),
         f1=seq_metrics.f1_score(y_true, y_pred),
         precision=seq_metrics.precision_score(y_true, y_pred),
         recall=seq_metrics.recall_score(y_true, y_pred))
Example #8
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    if metric_key == "accuracy":
        return seq_metrics.accuracy_score(targ_toks, pred_toks)

    if metric_key == "precision":
        return seq_metrics.precision_score(targ_toks, pred_toks)

    if metric_key == "recall":
        return seq_metrics.recall_score(targ_toks, pred_toks)

    if metric_key == "f1":
        return seq_metrics.f1_score(targ_toks, pred_toks)

    if metric_key == "classification_report":
        return seq_metrics.classification_report(targ_toks, pred_toks)
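A minimal usage sketch for the dispatcher above; it assumes calculate_token_class_metrics and its seq_metrics alias (from seqeval import metrics as seq_metrics) are in scope, and the tags are invented:

targs = [['B-ORG', 'I-ORG', 'O', 'B-PER']]
preds = [['B-ORG', 'I-ORG', 'O', 'O']]

# Expected roughly: accuracy 0.75, precision 1.0, recall 0.5, f1 ~0.667.
for key in ("accuracy", "precision", "recall", "f1"):
    print(key, calculate_token_class_metrics(preds, targs, metric_key=key))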
Example #9
def compute_f1(predictions, correct, idx2Label):
    label_pred = []
    for sentence in predictions:
        label_pred.append([idx2Label[element] for element in sentence])

    label_correct = []
    for sentence in correct:
        label_correct.append([idx2Label[element] for element in sentence])

    print(classification_report(label_correct, label_pred))
    # print("predictions ", len(label_pred))
    # print("correct labels ", len(label_correct)

    return precision_score(label_correct, label_pred), recall_score(
        label_correct, label_pred), f1_score(label_correct, label_pred)
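A minimal usage sketch for compute_f1 above, assuming it and the seqeval imports are in scope; the index sequences and the idx2Label mapping are invented:

idx2Label = {0: 'O', 1: 'B-PER', 2: 'I-PER'}
predictions = [[1, 2, 0]]  # one sentence of predicted label indices
correct = [[1, 2, 0]]      # matching gold indices

# compute_f1 also prints a classification report internally.
prec, rec, f1 = compute_f1(predictions, correct, idx2Label)
print(prec, rec, f1)  # 1.0 1.0 1.0 for this perfect toy match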
Example #10
 def score(self, y_true, y_pred):
     """Calculate f1 score.
     Args:
         y_true (list): true sequences.
         y_pred (list): predicted sequences.
     Returns:
         score: f1 score.
     """
     f1 = f1_score(y_true, y_pred)
     precision = precision_score(y_true, y_pred)
     recall = recall_score(y_true, y_pred)
     if self._digits:
         self._logger.info(
             classification_report(y_true, y_pred, digits=self._digits))
     return f1, precision, recall
Example #11
def model_evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    label_map = {i: label for i, label in enumerate(config.class_list)}
    criterion = FocalLoss(gamma=2, alpha=1)
    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids,
                labels) in enumerate(data_iter):

            input_ids = torch.tensor(input_ids).type(torch.LongTensor).to(
                config.device)
            attention_mask = torch.tensor(attention_mask).type(
                torch.LongTensor).to(config.device)
            token_type_ids = torch.tensor(token_type_ids).type(
                torch.LongTensor).to(config.device)
            labels = torch.tensor(labels).type(torch.LongTensor).to(
                config.device)

            outputs = model(input_ids, attention_mask, token_type_ids)

            active_loss = attention_mask.view(-1) == 1
            active_logits = outputs.view(-1, config.num_labels)[active_loss]
            active_labels = labels.view(-1)[active_loss]

            #loss = F.cross_entropy(active_logits, active_labels)
            loss = criterion(active_logits, active_labels)
            loss_total += loss
            active_labels = active_labels.data.cpu().numpy()
            predic = torch.max(active_logits.data, 1)[1].cpu().numpy()

            labels_all = np.append(labels_all, active_labels)
            predict_all = np.append(predict_all, predic)

    true_label = [label_map[key] for key in labels_all]
    predict_label = [label_map[key] for key in predict_all]

    acc = metrics.accuracy_score(labels_all, predict_all)
    precision = precision_score(true_label, predict_label)
    recall = recall_score(true_label, predict_label)
    f1 = f1_score(true_label, predict_label)
    if test:
        report = classification_report(true_label, predict_label, digits=4)
        confusion = metrics.confusion_matrix(true_label, predict_label)
        return acc, precision, recall, f1, loss_total / len(
            data_iter), report, confusion
    return acc, precision, recall, f1, loss_total / len(data_iter)
Example #12
    def get_metric(self, reset: bool) -> Union[float, Tuple[float, ...], Dict[str, float], Dict[str, List[float]]]:
        if not reset:
            return dict()

        if not self._predictions:
            return dict()

        metrics = {
            'seqeval_precision': precision_score(self._gold_labels, self._predictions),
            'seqeval_recall': recall_score(self._gold_labels, self._predictions),
            'seqeval_f1_score': f1_score(self._gold_labels, self._predictions)
        }

        self.reset()

        return metrics
Example #13
    def test_by_ground_truth(self):
        with open(self.file_name) as f:
            output = subprocess.check_output(['perl', 'conlleval.pl'],
                                             stdin=f).decode('utf-8')
            acc_true, p_true, r_true, f1_true = self.parse_conlleval_output(
                output)

            acc_pred = accuracy_score(self.y_true, self.y_pred)
            p_pred = precision_score(self.y_true, self.y_pred)
            r_pred = recall_score(self.y_true, self.y_pred)
            f1_pred = f1_score(self.y_true, self.y_pred)

            self.assertLess(abs(acc_pred - acc_true), 1e-4)
            self.assertLess(abs(p_pred - p_true), 1e-4)
            self.assertLess(abs(r_pred - r_true), 1e-4)
            self.assertLess(abs(f1_pred - f1_true), 1e-4)
Example #14
    def calculate_report(self, y, y_, transform=True):
        '''
        calculating F1, P, R

        :param y: golden label, list
        :param y_: model output, list
        :return:
        '''
        if transform:
            for i in range(len(y)):
                for j in range(len(y[i])):
                    y[i][j] = self.voc_i2s[y[i][j]]
            for i in range(len(y_)):
                for j in range(len(y_[i])):
                    y_[i][j] = self.voc_i2s[y_[i][j]]
        return precision_score(y, y_), recall_score(y, y_), f1_score(y, y_)
Example #15
def pos_evaluate_word_PRF(y_pred, y):
    #dict = {'E': 2, 'S': 3, 'B':0, 'I':1}
    y_word = []
    y_pos = []
    y_pred_word = []
    y_pred_pos = []
    for y_label, y_pred_label in zip(y, y_pred):
        y_word.append(y_label[0])
        y_pos.append(y_label[2:])
        y_pred_word.append(y_pred_label[0])
        y_pred_pos.append(y_pred_label[2:])

    word_cor_num = 0
    pos_cor_num = 0
    yp_wordnum = y_pred_word.count('E')+y_pred_word.count('S')
    yt_wordnum = y_word.count('E')+y_word.count('S')
    start = 0
    for i in range(len(y_word)):
        if y_word[i] == 'E' or y_word[i] == 'S':
            word_flag = True
            pos_flag = True
            for j in range(start, i+1):
                if y_word[j] != y_pred_word[j]:
                    word_flag = False
                    pos_flag = False
                    break
                if y_pos[j] != y_pred_pos[j]:
                    pos_flag = False
            if word_flag:
                word_cor_num += 1
            if pos_flag:
                pos_cor_num += 1
            start = i+1

    wP = word_cor_num / float(yp_wordnum) if yp_wordnum > 0 else -1
    wR = word_cor_num / float(yt_wordnum) if yt_wordnum > 0 else -1
    wF = 2 * wP * wR / (wP + wR)

    # pP = pos_cor_num / float(yp_wordnum) if yp_wordnum > 0 else -1
    # pR = pos_cor_num / float(yt_wordnum) if yt_wordnum > 0 else -1
    # pF = 2 * pP * pR / (pP + pR)

    pP = precision_score(y, y_pred)
    pR = recall_score(y, y_pred)
    pF = f1_score(y, y_pred)

    return (wP, wR, wF), (pP, pR, pF)
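The function above assumes joint segmentation/POS tags in which the first character is the word-boundary marker (B/I/E/S) and the part after the dash is the POS tag. A minimal sketch with invented tags, assuming the function and the seqeval imports are in scope:

y      = ['B-NN', 'E-NN', 'S-VV']  # gold: a two-character noun, then a one-character verb
y_pred = ['B-NN', 'E-NN', 'S-AD']  # segmentation correct, POS of the last word wrong

(wP, wR, wF), (pP, pR, pF) = pos_evaluate_word_PRF(y_pred, y)
print(wP, wR, wF)  # 1.0 1.0 1.0  word segmentation is perfect
print(pP, pR, pF)  # 0.5 0.5 0.5  one of two words carries the wrong POS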
Example #16
    def get_results(self, y_true, y_pred):
        result_med = collections.namedtuple(
            'result_med',
            ['precision', 'recall', 'f1_score', 'accuracy'])

        # Build a namedtuple instance instead of mutating the class itself.
        return result_med(
            precision=precision_score(y_true, y_pred),
            recall=recall_score(y_true, y_pred),
            f1_score=f1_score(y_true, y_pred),
            accuracy=accuracy_score(y_true, y_pred))
Example #17
def run(filename):
    print('------------------       %s      ---------------' % filename)
    with open(filename, 'r') as f:
        obj = json.load(f)

    y_true = obj['gold_results']
    y_pred = obj['pred_results']

    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    df = pd.DataFrame({'precision': [pre], 'recall': [rec], 'f1': [f1]})
    print(df)

    report = classification_report(y_true, y_pred)
    print(report)
Example #18
    def on_epoch_end(self, epoch, logs=None):
        pred_probs = self.model.predict(self.valid_features)
        if self.preprocessor.use_bert:
            pred_probs = pred_probs[:, 1:-1, :]  # remove <CLS> and <SEP>
        y_pred = self.preprocessor.label_decode(pred_probs,
                                                self.get_lengths(pred_probs))

        r = metrics.recall_score(self.valid_labels, y_pred)
        p = metrics.precision_score(self.valid_labels, y_pred)
        f1 = metrics.f1_score(self.valid_labels, y_pred)

        logs['val_r'] = r
        logs['val_p'] = p
        logs['val_f1'] = f1
        print('Epoch {}: val_r: {}, val_p: {}, val_f1: {}'.format(
            epoch + 1, r, p, f1))
        print(metrics.classification_report(self.valid_labels, y_pred))
Example #19
    def compute_metrics(self) -> Dict:
        # TODO Oddly, when zero_division is not passed (i.e. the default value 0 is used), this computation is much slower; needs further investigation.
        logger.info(
            f"\n{classification_report(self.out_label_list, self.preds_list, zero_division=0)}"
        )

        return {
            "precision":
            precision_score(self.out_label_list,
                            self.preds_list,
                            zero_division=0),
            "recall":
            recall_score(self.out_label_list, self.preds_list,
                         zero_division=0),
            "f1":
            f1_score(self.out_label_list, self.preds_list, zero_division=0),
        }
Example #20
def evaluate_results(net, test_loader, pad_id, cuda):
    logger.info("Evaluating test samples...")
    acc = 0
    out_labels = []
    true_labels = []
    net.eval()
    with torch.no_grad():
        for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
            x, e1_e2_start, labels, _, _, _ = data
            attention_mask = (x != pad_id).float()
            token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()

            if cuda:
                x = x.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()

            classification_logits = net(
                x,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                Q=None,
                e1_e2_start=e1_e2_start,
            )

            accuracy, (o, l) = evaluate_(classification_logits,
                                         labels,
                                         ignore_idx=-1)
            out_labels.append([str(i) for i in o])
            true_labels.append([str(i) for i in l])
            acc += accuracy

    accuracy = acc / (i + 1)
    results = {
        "accuracy": accuracy,
        "precision": precision_score(true_labels, out_labels),
        "recall": recall_score(true_labels, out_labels),
        "f1": f1_score(true_labels, out_labels),
    }
    logger.info("***** Eval results *****")
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results
Example #21
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }
Example #22
    def get_metrics(pred_list: list, label_list: list) -> dict:
        """
        获取序列标注的各项评估指标
        :param pred_list:  list 模型预测的标签序列
        :param label_list: list 人为标注的标签序列
        :return: dict 包含各类评估指标的字典
        """
        if len(pred_list) == 0:
            return {
                "accuracy": 0, "precision": 0, "recall": 0, "f1": 0
            }

        return {
            "accuracy": "{:.5f}".format(metrics.accuracy_score(label_list, pred_list)),
            "precision": "{:.5f}".format(metrics.precision_score(label_list, pred_list)),
            "recall": "{:.5f}".format(metrics.recall_score(label_list, pred_list)),
            "f1": "{:.5f}".format(metrics.f1_score(label_list, pred_list))
        }
Example #23
def compute_metrics(predictions, label_ids, label_map):
    preds = np.argmax(predictions, axis=-1)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
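A minimal sketch of how this compute_metrics variant is typically fed, with invented toy logits and label ids; positions marked with the CrossEntropyLoss ignore_index (-100) are dropped before scoring. It assumes the function and its numpy/torch/seqeval imports are in scope:

import numpy as np

label_map = {0: 'O', 1: 'B-PER', 2: 'I-PER'}

# Batch of one sentence, four positions, three classes; argmax gives [1, 2, 0, 0].
predictions = np.array([[[0.1, 0.8, 0.1],
                         [0.1, 0.1, 0.8],
                         [0.9, 0.05, 0.05],
                         [0.9, 0.05, 0.05]]])
# The last position is a padding/special token and is masked with -100.
label_ids = np.array([[1, 2, 0, -100]])

print(compute_metrics(predictions, label_ids, label_map))
# {'precision': 1.0, 'recall': 1.0, 'f1': 1.0} for this perfect toy match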
Example #24
    def compute(self, p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            self.id2tag[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            self.id2tag[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        return {
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
            "accuracy": accuracy_score(true_labels, true_predictions),
        }
Example #25
def evaluate_nn(gold_labels, predictions, id2label):
    """

    Parameters
    ----------
    gold_labels : numpy.ndarray
        Gold labels for all tokens (or sentences if seq2label)
    predictions : numpy.ndarray
        Respective predicted labels for all tokens (or sentences if seq2label)
    id2label : dict
        Dictionary that maps from ids to real labels.

    Returns
    -------
    acc : float
        Accuracy of predicted labels.
    f1_e : float
        F1 score of predicted labels, adapted for the BIO scheme (seqeval module)

    """

    acc = (predictions == gold_labels).sum() / len(predictions)

    # token level
    #precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
    #                              gold_labels, predictions, average='micro')

    # seqeval needs IOB2 labels, convert it
    preds = [id2label[x] for x in predictions]
    golds = [id2label[x] for x in gold_labels]

    # entity level
    precision_e = precision_score(golds, preds)
    recall_e = recall_score(golds, preds)
    f1_e = f1_score(golds, preds)

    print('         \t Accuracy: {:.3f}'.format(acc))
    #print('  sklearn   \t Prec: {:.2f} \t Recall: {:.2f} \t F1: {:.2f}'.format(
    #    precision, recall, f1))
    print(
        '         \t Precision: {:.2f} \t Recall: {:.2f} \t F1: {:.2f}'.format(
            precision_e, recall_e, f1_e))

    return acc, f1_e
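As in this example (and in Example #11), classic seqeval also accepts a single flat list of tags for the whole evaluation set, not only nested per-sentence lists; a minimal self-contained sketch:

from seqeval.metrics import f1_score, precision_score, recall_score

# A flat list of tags also works in seqeval's default (non-strict) mode.
golds = ['B-PER', 'I-PER', 'O', 'B-LOC']
preds = ['B-PER', 'I-PER', 'O', 'O']
print(precision_score(golds, preds))  # 1.0
print(recall_score(golds, preds))     # 0.5
print(f1_score(golds, preds))         # ~0.667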
Example #26
    def xzk_eval_model(self, dataloader, name=None):
        with torch.no_grad():
            metrics = np.asarray([0, 0, 0], dtype=int)
            all_true_y_label = list()
            all_pred_y_label = list()
            print("testing")
            for batch, data in tqdm(enumerate(dataloader)):
                insts = data[-1]
                data = [x.to(self.device) for x in data[0:-1]]
                token_id_seq, data_length, char_seq_tensor, char_seq_len, masks, label_seq = data
                sequence_loss, logits = self.model(token_id_seq, data_length,
                                                   char_seq_tensor,
                                                   char_seq_len, masks,
                                                   label_seq)
                batch_max_scores, pred_ids = self.model.decode(
                    logits, data_length)
                metrics += evaluate_batch_insts(insts, pred_ids, label_seq,
                                                data_length,
                                                self.config.idx2labels,
                                                self.config.use_crf_layer)

                for i in insts:
                    all_pred_y_label.append(i.prediction)
                    all_true_y_label.append(i.output)

            p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
            precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
            recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
            fscore = 2.0 * precision * recall / (
                precision + recall) if precision != 0 or recall != 0 else 0
            print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
                  (name, precision, recall, fscore),
                  flush=True)

            p = precision_score(all_true_y_label, all_pred_y_label)
            r = recall_score(all_true_y_label, all_pred_y_label)
            f1 = f1_score(all_true_y_label, all_pred_y_label)
            print("Precision: %.2f, Recall: %.2f, F1: %.2f" % (p, r, f1),
                  flush=True)
            print('acc',
                  accuracy_score(all_true_y_label, all_pred_y_label),
                  flush=True)
            print(classification_report(all_true_y_label, all_pred_y_label))
        return precision, recall, fscore
Example #27
def student_metrics(predictions, label_ids, student_to_teacher_map):
    label_map = {i: label for i, label in enumerate(['B', 'I', 'O'])}
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]
    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != -100:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[student_to_teacher_map[preds[i][j]]])

    metrics = {
        "accuracy_score": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
    return metrics
Example #28
def f1_pre_rec(labels, preds, is_ner=True):
    if is_ner:
        return {
            "precision":
            seqeval_metrics.precision_score(labels, preds, suffix=True),
            "recall":
            seqeval_metrics.recall_score(labels, preds, suffix=True),
            "f1":
            seqeval_metrics.f1_score(labels, preds, suffix=True),
        }
    else:
        return {
            "precision":
            sklearn_metrics.precision_score(labels, preds, average="macro"),
            "recall":
            sklearn_metrics.recall_score(labels, preds, average="macro"),
            "f1":
            sklearn_metrics.f1_score(labels, preds, average="macro"),
        }
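In the NER branch above, suffix=True tells seqeval that the chunk marker follows the entity type (tags such as PER-B / PER-I instead of B-PER / I-PER); a minimal sketch with invented tags:

from seqeval import metrics as seqeval_metrics

# Suffix-style tags: the B/I marker comes after the entity type.
labels = [['PER-B', 'PER-I', 'O']]
preds = [['PER-B', 'PER-I', 'O']]
print(seqeval_metrics.f1_score(labels, preds, suffix=True))  # 1.0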
Example #29
    def _compute(self, predictions, references, suffix=False):
        true_entities = set(get_entities(references, suffix))
        pred_entities = set(get_entities(predictions, suffix))
        d1 = defaultdict(set)
        d2 = defaultdict(set)
        scores = {}

        for e in true_entities:
            d1[e[0]].add((e[1], e[2]))

        for e in pred_entities:
            d2[e[0]].add((e[1], e[2]))

        for type_name, true_entities in d1.items():
            scores[type_name] = {}
            pred_entities = d2[type_name]
            nb_correct = len(true_entities & pred_entities)
            nb_pred = len(pred_entities)
            nb_true = len(true_entities)

            p = nb_correct / nb_pred if nb_pred > 0 else 0
            r = nb_correct / nb_true if nb_true > 0 else 0
            f1 = 2 * p * r / (p + r) if p + r > 0 else 0

            scores[type_name]["precision"] = p
            scores[type_name]["recall"] = r
            scores[type_name]["f1"] = f1
            scores[type_name]["number"] = nb_true

        scores["overall_precision"] = precision_score(y_true=references,
                                                      y_pred=predictions,
                                                      suffix=suffix)
        scores["overall_recall"] = recall_score(y_true=references,
                                                y_pred=predictions,
                                                suffix=suffix)
        scores["overall_f1"] = f1_score(y_true=references,
                                        y_pred=predictions,
                                        suffix=suffix)
        scores["overall_accuracy"] = accuracy_score(y_true=references,
                                                    y_pred=predictions)

        return scores
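The per-type loop above is built on seqeval's get_entities helper, which turns a tag sequence into (type, start, end) spans with inclusive indices; a minimal sketch:

from seqeval.metrics.sequence_labeling import get_entities

tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
print(get_entities(tags))  # [('PER', 0, 1), ('LOC', 3, 3)]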
Example #30
    def __test(self, model: PreTrainedModel,
               data: DataLoader) -> (float, float, float, float, float, str):
        eval_loss = 0.
        eval_steps, eval_examples = 0, 0
        tokens, eval_predictions, eval_labels = [], [], []
        model.eval()
        for batch in tqdm(data):
            batch_tokens, batch_masks, batch_tags = tuple(
                t.to(self.device) for t in batch)
            with torch.no_grad():
                outputs = model(batch_tokens,
                                attention_mask=batch_masks,
                                labels=batch_tags)
            logits = outputs[1].detach().cpu().numpy()
            label_ids = batch_tags.to('cpu').numpy()
            toks = batch_tokens.to('cpu').numpy()

            eval_loss += outputs[0].mean().item()
            batch_toks = [
                self.tokenizer.convert_ids_to_tokens(sentence)
                for sentence in toks
            ]
            tokens.extend(batch_toks)
            eval_predictions.extend(
                [list(p) for p in np.argmax(logits, axis=2)])
            eval_labels.extend(label_ids)

            eval_examples += batch_tokens.size(0)
            eval_steps += 1

        eval_loss = eval_loss / eval_steps

        predicted_tags, valid_tags, tokens = self.translate(
            eval_predictions, eval_labels, tokens)

        score_acc = accuracy_score(valid_tags, predicted_tags)
        score_f1 = f1_score(valid_tags, predicted_tags)
        score_p = precision_score(valid_tags, predicted_tags)
        score_r = recall_score(valid_tags, predicted_tags)
        report = classification_report(valid_tags, predicted_tags)

        return eval_loss, score_acc, score_f1, score_p, score_r, report