Example No. 1
    def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode,
                               mini_batch_size, out_path):
        eval_loss = 0

        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        y_true = []
        y_pred = []

        for batch in data_loader:

            # predict for batch
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                mini_batch_size=mini_batch_size,
                                label_name='predicted',
                                return_loss=True)
            eval_loss += loss
            batch_no += 1

            for sentence in batch:

                # make list of gold tags
                gold_spans = sentence.get_spans(self.tag_type)
                gold_tags = [(span.tag, repr(span)) for span in gold_spans]

                # make list of predicted tags
                predicted_spans = sentence.get_spans("predicted")
                predicted_tags = [(span.tag, repr(span))
                                  for span in predicted_spans]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)

                tags_gold = []
                tags_pred = []

                # also write to file in BIO format to use old conlleval script
                if out_path:
                    for token in sentence:
                        # check if in gold spans
                        gold_tag = 'O'
                        for span in gold_spans:
                            if token in span:
                                gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_gold.append(gold_tag)

                        predicted_tag = 'O'
                        # check if in predicted spans
                        for span in predicted_spans:
                            if token in span:
                                predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                        tags_pred.append(predicted_tag)

                        lines.append(
                            f'{token.text} {gold_tag} {predicted_tag}\n')
                    lines.append('\n')

                y_true.append(tags_gold)
                y_pred.append(tags_pred)

        if out_path:
            with open(Path(out_path), "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        eval_loss /= batch_no

        detailed_result = (
            "\nResults:"
            f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
            f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
            '\n\nBy class:')

        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
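The span-level F1 above reduces to exact matching of (tag, span) pairs between the gold and predicted span lists. A standalone sketch of that matching, with made-up span strings for illustration:

# Standalone sketch of the exact-match span comparison used above.
# The span strings below are made up for illustration.
gold_tags = [('PER', 'Span[0:2] "John Smith"'), ('LOC', 'Span[4:5] "Berlin"')]
predicted_tags = [('PER', 'Span[0:2] "John Smith"'), ('ORG', 'Span[4:5] "Berlin"')]

tp = sum(1 for pair in predicted_tags if pair in gold_tags)      # 1: the PER span matches exactly
fp = sum(1 for pair in predicted_tags if pair not in gold_tags)  # 1: Berlin predicted as ORG
fn = sum(1 for pair in gold_tags if pair not in predicted_tags)  # 1: Berlin as LOC was missed
print(tp, fp, fn)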
Example No. 2
    def evaluate(self,
                 evaluation: List[Sentence],
                 out_path=None,
                 evaluation_method: str = 'F1',
                 eval_batch_size: int = 32,
                 embeddings_in_memory: bool = True):

        batch_no: int = 0
        batches = [
            evaluation[x:x + eval_batch_size]
            for x in range(0, len(evaluation), eval_batch_size)
        ]

        metric = Metric('')

        lines: List[str] = []

        for batch in batches:
            batch_no += 1

            scores, tag_seq = self.model._predict_scores_batch(batch)
            predicted_ids = tag_seq
            all_tokens = []
            for sentence in batch:
                all_tokens.extend(sentence.tokens)

            for (token, score, predicted_id) in zip(all_tokens, scores,
                                                    predicted_ids):
                token: Token = token
                # get the predicted tag
                predicted_value = self.model.tag_dictionary.get_item_for_index(
                    predicted_id)
                token.add_tag('predicted', predicted_value, score)

            for sentence in batch:

                # add predicted tags
                for token in sentence.tokens:
                    predicted_tag: Label = token.get_tag('predicted')

                    # append both to file for evaluation
                    eval_line = '{} {} {}\n'.format(
                        token.text,
                        token.get_tag(self.model.tag_type).value,
                        predicted_tag.value)

                    lines.append(eval_line)
                lines.append('\n')

                # make list of gold tags
                gold_tags = [
                    str(tag) for tag in sentence.get_spans(self.model.tag_type)
                ]

                # make list of predicted tags
                predicted_tags = [
                    str(tag) for tag in sentence.get_spans('predicted')
                ]

                # check for true positives, false positives and false negatives
                for prediction in predicted_tags:
                    if prediction in gold_tags:
                        metric.tp()
                    else:
                        metric.fp()

                for gold in gold_tags:
                    if gold not in predicted_tags:
                        metric.fn()

            if not embeddings_in_memory:
                self.clear_embeddings_in_batch(batch)

        if out_path is not None:
            test_tsv = os.path.join(out_path, "test.tsv")
            with open(test_tsv, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        if evaluation_method == 'accuracy':
            score = metric.accuracy()
            return score, metric

        if evaluation_method == 'F1':
            score = metric.f_score()
            return score, metric
Example No. 3
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            metric = Metric("Evaluation")

            lines: List[str] = []
            batch_count: int = 0
            for batch in data_loader:

                batch_count += 1

                labels, loss = self.forward_labels_and_loss(batch)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= batch_count

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
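The four-way if/elif chain above assigns every label in the label dictionary to exactly one of the tp/fp/fn/tn counters for each sentence. A standalone illustration with made-up labels:

# Standalone illustration of the per-label counting above.
# Label names and the predicted/gold label sets are made up for this sketch.
available_labels = ['POSITIVE', 'NEGATIVE', 'NEUTRAL']
predictions_for_sentence = ['POSITIVE']
true_values_for_sentence = ['NEGATIVE']

for label in available_labels:
    in_pred = label in predictions_for_sentence
    in_gold = label in true_values_for_sentence
    if in_pred and in_gold:
        bucket = 'tp'
    elif in_pred and not in_gold:
        bucket = 'fp'
    elif not in_pred and in_gold:
        bucket = 'fn'
    else:
        bucket = 'tn'
    print(label, bucket)  # POSITIVE -> fp, NEGATIVE -> fn, NEUTRAL -> tn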
Example No. 4
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         metric = Metric('Evaluation')
         lines = []
         batch_count = 0
         for batch in data_loader:
             batch_count += 1
             (labels, loss) = self.forward_labels_and_loss(batch)
             eval_loss += loss
             sentences_for_batch = [
                 sent.to_plain_string() for sent in batch
             ]
             confidences_for_batch = [[
                 label.score for label in sent_labels
             ] for sent_labels in labels]
             predictions_for_batch = [[
                 label.value for label in sent_labels
             ] for sent_labels in labels]
             true_values_for_batch = [
                 sentence.get_label_names() for sentence in batch
             ]
             available_labels = self.label_dictionary.get_items()
             for (sentence, confidence, prediction, true_value) in zip(
                     sentences_for_batch, confidences_for_batch,
                     predictions_for_batch, true_values_for_batch):
                 eval_line = '{}\t{}\t{}\t{}\n'.format(
                     sentence, true_value, prediction, confidence)
                 lines.append(eval_line)
             for (predictions_for_sentence,
                  true_values_for_sentence) in zip(predictions_for_batch,
                                                   true_values_for_batch):
                 for label in available_labels:
                     if ((label in predictions_for_sentence)
                             and (label in true_values_for_sentence)):
                         metric.add_tp(label)
                     elif ((label in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_fp(label)
                     elif ((label not in predictions_for_sentence)
                           and (label in true_values_for_sentence)):
                         metric.add_fn(label)
                     elif ((label not in predictions_for_sentence)
                           and (label not in true_values_for_sentence)):
                         metric.add_tn(label)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_count
         detailed_result = (
             f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
             f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
         )
         for class_name in metric.get_classes():
             detailed_result += (
                 f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                 f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                 f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                 f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                 f"{metric.f_score(class_name):.4f}")
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         return (result, eval_loss)
Example No. 5
    def evaluate(
        self,
        sentences: Dataset,
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = True,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            batch_loader = torch.utils.data.DataLoader(
                sentences,
                batch_size=eval_mini_batch_size,
                shuffle=False,
                num_workers=4,
                collate_fn=list,
            )

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batch_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(features, batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag_label("predicted", tag)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans(self.tag_type)
                    ]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag)) for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory
                )

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}"
                )

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
Example No. 6
    def train(self,
              base_path: str,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              max_epochs: int = 50,
              anneal_factor: float = 0.5,
              patience: int = 5,
              save_model: bool = True,
              embeddings_in_memory: bool = False,
              train_with_dev: bool = False,
              eval_on_train: bool = True):
        """
        Trains the model using the training data of the corpus.
        :param patience: number of 'bad' epochs before learning rate gets decreased
        :param anneal_factor: learning rate will be decreased by this factor
        :param base_path: the directory to which any results should be written to
        :param learning_rate: the learning rate
        :param mini_batch_size: the mini batch size
        :param max_epochs: the maximum number of epochs to train
        :param save_model: boolean value indicating, whether the model should be saved or not
        :param embeddings_in_memory: boolean value indicating, if embeddings should be kept in memory or not
        :param train_with_dev: boolean value indicating, if the dev data set should be used for training or not
        :param eval_on_train: boolean value indicating, if evaluation metrics should be calculated on training data set
        or not
        """

        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
                .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                        Metric.tsv_header('TEST')))

        weight_extractor = WeightExtractor(base_path)

        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        anneal_mode = 'min' if train_with_dev else 'max'
        scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer,
                                                         factor=anneal_factor,
                                                         patience=patience,
                                                         mode=anneal_mode)

        train_data = self.corpus.train
        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            # record overall best dev scores and best loss
            best_score = 0

            for epoch in range(max_epochs):
                log.info('-' * 100)

                if not self.test_mode:
                    random.shuffle(train_data)

                self.model.train()

                batches = [
                    self.corpus.train[x:x + mini_batch_size]
                    for x in range(0, len(self.corpus.train), mini_batch_size)
                ]

                current_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                for batch_no, batch in enumerate(batches):
                    scores = self.model.forward(batch)
                    loss = self.model.calculate_loss(scores, batch)

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_sentences += len(batch)
                    current_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                                epoch + 1, batch_no, len(batches),
                                current_loss / seen_sentences))
                        iteration = epoch * len(batches) + batch_no
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

                current_loss /= len(train_data)

                self.model.eval()

                log.info('-' * 100)
                log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                    epoch + 1, learning_rate, scheduler.num_bad_epochs))

                dev_metric = train_metric = None
                dev_loss = '_'
                train_loss = current_loss

                if eval_on_train:
                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, embeddings_in_memory,
                        mini_batch_size)

                if not train_with_dev:
                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, embeddings_in_memory,
                        mini_batch_size)

                with open(loss_txt, 'a') as f:
                    train_metric_str = train_metric.to_tsv(
                    ) if train_metric is not None else Metric.to_empty_tsv()
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        epoch, datetime.datetime.now(), train_loss,
                        train_metric_str, dev_loss, dev_metric_str, '_',
                        Metric.to_empty_tsv()))

                # anneal against train loss if training with dev, otherwise anneal against dev score
                scheduler.step(
                    current_loss) if train_with_dev else scheduler.step(
                        dev_metric.f_score())

                is_best_model_so_far: bool = False
                current_score = dev_metric.f_score(
                ) if not train_with_dev else train_metric.f_score()

                if current_score >= best_score:
                    best_score = current_score
                    is_best_model_so_far = True

                if is_best_model_so_far:
                    if save_model:
                        self.model.save(base_path + "/model.pt")

            self.model.save(base_path + "/final-model.pt")

            if save_model:
                self.model = TextClassifier.load_from_file(base_path +
                                                           "/model.pt")

            log.info('-' * 100)
            log.info('Testing using best model ...')

            self.model.eval()
            test_metrics, test_loss = self.evaluate(
                self.corpus.test,
                mini_batch_size=mini_batch_size,
                eval_class_metrics=True,
                embeddings_in_memory=embeddings_in_memory)

            for metric in test_metrics.values():
                metric.print()
            self.model.train()

            log.info('-' * 100)

        except KeyboardInterrupt:
            log.info('-' * 100)
            log.info('Exiting from training early.')
            log.info('Saving model ...')
            with open(base_path + "/final-model.pt", 'wb') as model_save_file:
                torch.save(self.model, model_save_file, pickle_protocol=4)
                model_save_file.close()
            log.info('Done.')
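A call to the train() method above might look as follows. This is only a sketch: the trainer class name, its constructor, and the pre-built classifier/corpus objects are assumptions rather than part of the example.

# Hypothetical usage sketch for the train() method above.
# TextClassifierTrainer, its constructor, and the classifier/corpus objects
# are assumptions for illustration only.
trainer = TextClassifierTrainer(classifier, corpus)
trainer.train('resources/classification',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=50,
              patience=5,
              embeddings_in_memory=False,
              train_with_dev=False)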
Example No. 7
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert(metric.precision('class-1') == 0.5)
    assert(metric.precision('class-2') == 0.5)
    assert(metric.precision('class-3') == 0.1)
    assert(metric.precision('class-4') == 0.5)

    assert(metric.recall('class-1') == 1)
    assert(metric.recall('class-2') == 1)
    assert(metric.recall('class-3') == 1)
    assert(metric.recall('class-4') == 1)

    assert(metric.accuracy() == metric.micro_avg_accuracy())
    assert(metric.f_score() == metric.micro_avg_f_score())

    assert(metric.f_score('class-1') == 0.6667)
    assert(metric.f_score('class-2') == 0.6667)
    assert(metric.f_score('class-3') == 0.1818)
    assert(metric.f_score('class-4') == 0.6667)

    assert(metric.accuracy('class-1') == 0.75)
    assert(metric.accuracy('class-2') == 0.75)
    assert(metric.accuracy('class-3') == 0.1)
    assert(metric.accuracy('class-4') == 0.75)

    assert(metric.micro_avg_f_score() == 0.2184)
    assert(metric.macro_avg_f_score() == 0.5714)

    assert(metric.micro_avg_accuracy() == 0.1696)
    assert(metric.macro_avg_accuracy() == 0.5875)

    assert(metric.precision() == 0.1226)
    assert(metric.recall() == 1)
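The asserted micro-averaged values follow from the pooled counts across the four classes; the 0.2184 assertion (rather than 0.2185) suggests that precision and recall are rounded to four decimals before the f-score is computed. A quick arithmetic check in plain Python, assuming that rounding behaviour:

# Pooled counts from the test above: tp = 1+1+10+1, fp = 1+1+90+1, fn = 0, tn = 2+2+0+2
tp, fp, fn, tn = 13, 93, 0, 6

precision = round(tp / (tp + fp), 4)                          # 13 / 106 -> 0.1226
recall = round(tp / (tp + fn), 4)                             # 13 / 13  -> 1.0
f1 = round(2 * precision * recall / (precision + recall), 4)  # -> 0.2184
accuracy = round((tp + tn) / (tp + fp + fn + tn), 4)          # 19 / 112 -> 0.1696

print(precision, recall, f1, accuracy)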
Example No. 8
 def evaluate(self,
              data_loader: DataLoader,
              out_path: Path = None,
              embeddings_storage_mode: str = 'cpu') -> (Result, float):
     with torch.no_grad():
         eval_loss = 0
         batch_no = 0
         metric = Metric('Evaluation')
         lines = []
         for batch in data_loader:
             batch_no += 1
             with torch.no_grad():
                 features = self.forward(batch)
                 loss = self._calculate_loss(features, batch)
                 (tags, _) = self._obtain_labels(features, batch)
             eval_loss += loss
             for (sentence, sent_tags) in zip(batch, tags):
                 for (token, tag) in zip(sentence.tokens, sent_tags):
                     token = token
                     token.add_tag_label('predicted', tag)
                     eval_line = '{} {} {} {}\n'.format(
                         token.text,
                         token.get_tag(self.tag_type).value, tag.value,
                         tag.score)
                     lines.append(eval_line)
                 lines.append('\n')
             for sentence in batch:
                 gold_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans(self.tag_type)]
                 predicted_tags = [
                     (tag.tag, str(tag))
                     for tag in sentence.get_spans('predicted')
                 ]
                 for (tag, prediction) in predicted_tags:
                     if ((tag, prediction) in gold_tags):
                         metric.add_tp(tag)
                     else:
                         metric.add_fp(tag)
                 for (tag, gold) in gold_tags:
                     if ((tag, gold) not in predicted_tags):
                         metric.add_fn(tag)
                     else:
                         metric.add_tn(tag)
             store_embeddings(batch, embeddings_storage_mode)
         eval_loss /= batch_no
         if (out_path is not None):
             with open(out_path, 'w', encoding='utf-8') as outfile:
                 outfile.write(''.join(lines))
         detailed_result = (
             f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
             f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
         )
         for class_name in metric.get_classes():
             detailed_result += (
                 f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                 f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                 f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                 f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                 f"{metric.f_score(class_name):.4f}")
         result = Result(main_score=metric.micro_avg_f_score(),
                         log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                         log_header='PRECISION\tRECALL\tF1',
                         detailed_results=detailed_result)
         return (result, eval_loss)
Example No. 9
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if type(out_path) == str:
            out_path = Path(out_path)
        metric = Metric("Evaluation", beta=self.beta)
        parsing_metric = ParsingMetric()

        lines: List[str] = []

        eval_loss_arc = 0
        eval_loss_rel = 0

        for batch_idx, batch in enumerate(data_loader):

            with torch.no_grad():
                score_arc, score_rel = self.forward(batch)
                loss_arc, loss_rel = self._calculate_loss(
                    score_arc, score_rel, batch)
                arc_prediction, relation_prediction = self._obtain_labels_(
                    score_arc, score_rel)

            parsing_metric(arc_prediction, relation_prediction, batch)

            eval_loss_arc += loss_arc
            eval_loss_rel += loss_rel

            for (sentence, arcs, sent_tags) in zip(batch, arc_prediction,
                                                   relation_prediction):
                for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                    token: Token = token
                    token.add_tag_label("predicted", Label(tag))
                    token.add_tag_label("predicted_head_id", Label(str(arc)))

                    # append both to file for evaluation
                    eval_line = "{} {} {} {} {}\n".format(
                        token.text,
                        token.tags['dependency'].value,
                        str(token.head_id),
                        tag,
                        str(arc),
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:

                # make list of gold tags
                gold_tags = [
                    token.tags['dependency'].value for token in sentence.tokens
                ]

                # make list of predicted tags
                predicted_tags = [
                    tag.tag for tag in sentence.get_spans("predicted")
                ]

                # check for true positives, false positives and false negatives
                for tag_indx, predicted_tag in enumerate(predicted_tags):
                    if predicted_tag == gold_tags[tag_indx]:
                        metric.add_tp(predicted_tag)
                    else:
                        metric.add_fp(predicted_tag)

                for tag_indx, label_tag in enumerate(gold_tags):
                    if label_tag != predicted_tags[tag_indx]:
                        metric.add_fn(label_tag)
                    else:
                        metric.add_tn(label_tag)
            store_embeddings(batch, embedding_storage_mode)

        eval_loss_arc /= len(data_loader)
        eval_loss_rel /= len(data_loader)

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
            f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=
            f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss_arc + eval_loss_rel
Example No. 10
def eval_flair_spans(data, predicted_list, batch_size, out_path=None):
    metric = Metric('Evaluation')

    mini_batch_size = batch_size
    batches = [
        data[x:x + mini_batch_size]
        for x in range(0, len(data), mini_batch_size)
    ]

    lines: List[str] = []
    word_counter = 0
    for batch in batches:
        for sentence in batch:
            for token in sentence.tokens:
                tag = Label(predicted_list[word_counter])
                word_counter += 1
                token.add_tag_label('predicted', tag)

                # append both to file for evaluation
                eval_line = '{} {} {} {}\n'.format(token.text,
                                                   token.get_tag('ner').value,
                                                   tag.value, tag.score)

                lines.append(eval_line)
            lines.append('\n')

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag))
                         for tag in sentence.get_spans('ner')]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans('predicted')]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

    # add metrics scores at the beginning of the file
    lines.insert(0, str(metric) + "\n\n")

    if out_path is not None:

        # create folder for json and corresponding output
        if not os.path.exists(os.path.dirname(out_path)):
            try:
                os.makedirs(os.path.dirname(out_path))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        with open(out_path, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))
    return metric
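A call to eval_flair_spans might look like the following; the sentence list and the flat prediction list are placeholders for illustration:

# Hypothetical call; test_sentences (flair Sentence objects carrying gold 'ner'
# spans) and flat_predictions (one predicted tag string per token, in order)
# are placeholders for illustration.
metric = eval_flair_spans(test_sentences, flat_predictions,
                          batch_size=32,
                          out_path='output/ner_eval.txt')
print(metric)  # the same summary string that is prepended to the output file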
Example No. 11
    def _evaluate_text_classifier(model,
                                  sentences,
                                  eval_mini_batch_size=32,
                                  embeddings_in_memory=False,
                                  out_path=None):

        with torch.no_grad():

            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            lines = []

            for batch in batches:

                labels, loss = model.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]

                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]

                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]

                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]

                available_labels = model.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch, confidences_for_batch,
                        predictions_for_batch, true_values_for_batch):

                    eval_line = '{}\t{}\t{}\t{}\n'.format(
                        sentence, true_value, prediction, confidence)

                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    ModelTrainer._evaluate_sentence_for_text_classification(
                        metric, available_labels, predictions_for_sentence,
                        true_values_for_sentence)

            eval_loss /= len(sentences)

            if out_path is not None:

                with open(out_path, "w", encoding='utf-8') as outfile:

                    outfile.write(''.join(lines))

            return metric, eval_loss
Example No. 12
    def _evaluate_sequence_tagger(model,
                                  sentences,
                                  eval_mini_batch_size=32,
                                  embeddings_in_memory=True,
                                  out_path=None):

        with torch.no_grad():

            eval_loss = 0

            batch_no = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric('Evaluation')

            lines = []

            for batch in batches:

                batch_no += 1

                tags, loss = model.forward_labels_and_loss(batch)

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):

                    for (token, tag) in zip(sentence.tokens, sent_tags):

                        token = token

                        token.add_tag_label('predicted', tag)

                        # append both to file for evaluation

                        eval_line = '{} {} {} {}\n'.format(
                            token.text,
                            token.get_tag(model.tag_type).value, tag.value,
                            tag.score)

                        lines.append(eval_line)

                    lines.append('\n')

                for sentence in batch:

                    # make list of gold tags

                    gold_tags = [(tag.tag, str(tag))
                                 for tag in sentence.get_spans(model.tag_type)]

                    # make list of predicted tags

                    predicted_tags = [
                        (tag.tag, str(tag))
                        for tag in sentence.get_spans('predicted')
                    ]

                    # check for true positives, false positives and false negatives

                    for tag, prediction in predicted_tags:

                        if (tag, prediction) in gold_tags:

                            metric.add_tp(tag)

                        else:

                            metric.add_fp(tag)

                    for tag, gold in gold_tags:

                        if (tag, gold) not in predicted_tags:

                            metric.add_fn(tag)

                        else:

                            metric.add_tn(tag)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss /= len(sentences)

            if out_path is not None:

                with open(out_path, "w", encoding='utf-8') as outfile:

                    outfile.write(''.join(lines))

            return metric, eval_loss
Example No. 13
    def train(self,
              base_path,
              evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
              learning_rate=0.1,
              mini_batch_size=32,
              eval_mini_batch_size=None,
              max_epochs=100,
              anneal_factor=0.5,
              patience=3,
              anneal_against_train_loss=True,
              train_with_dev=False,
              monitor_train=False,
              embeddings_in_memory=True,
              checkpoint=False,
              save_final_model=True,
              anneal_with_restarts=False,
              test_mode=False,
              param_selection_mode=False,
              **kwargs):

        if eval_mini_batch_size is None:

            eval_mini_batch_size = mini_batch_size

        # cast string to Path

        if type(base_path) is str:

            base_path = Path(base_path)

        add_file_handler(log, base_path / 'training.log')

        log_line(log)

        log.info(f'Evaluation method: {evaluation_metric.name}')

        if not param_selection_mode:

            loss_txt = init_output_file(base_path, 'loss.tsv')

            with open(loss_txt, 'a') as f:

                f.write(
                    f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}\tDEV_LOSS\t{Metric.tsv_header("DEV")}'
                    f'\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n')

            weight_extractor = WeightExtractor(base_path)

        optimizer = self.optimizer(self.model.parameters(),
                                   lr=learning_rate,
                                   **kwargs)

        if self.optimizer_state is not None:

            optimizer.load_state_dict(self.optimizer_state)

        # annealing scheduler

        anneal_mode = 'min' if anneal_against_train_loss else 'max'

        if isinstance(optimizer, (AdamW, SGDW)):

            scheduler = ReduceLRWDOnPlateau(optimizer,
                                            factor=anneal_factor,
                                            patience=patience,
                                            mode=anneal_mode,
                                            verbose=True)

        else:

            scheduler = ReduceLROnPlateau(optimizer,
                                          factor=anneal_factor,
                                          patience=patience,
                                          mode=anneal_mode,
                                          verbose=True)

        if self.scheduler_state is not None:

            scheduler.load_state_dict(self.scheduler_state)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set

        if train_with_dev:

            train_data.extend(self.corpus.dev)

        dev_score_history = []

        dev_loss_history = []

        train_loss_history = []

        # At any point you can hit Ctrl + C to break out of training early.

        try:

            previous_learning_rate = learning_rate

            for epoch in range(0 + self.epoch, max_epochs + self.epoch):

                log_line(log)

                try:

                    bad_epochs = scheduler.num_bad_epochs

                except:

                    bad_epochs = 0

                for group in optimizer.param_groups:

                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled

                if learning_rate != previous_learning_rate and anneal_with_restarts and\
                        (base_path / 'best-model.pt').exists():

                    log.info('resetting to best model')

                    self.model.load_from_file(base_path / 'best-model.pt')

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small

                if learning_rate < 0.0001:

                    log_line(log)

                    log.info('learning rate too small - quitting training!')

                    log_line(log)

                    break

                if not test_mode:

                    random.shuffle(train_data)

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                self.model.train()

                train_loss = 0

                seen_sentences = 0

                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):

                    loss = self.model.forward_loss(batch)

                    optimizer.zero_grad()

                    loss.backward()

                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)

                    optimizer.step()

                    seen_sentences += len(batch)

                    train_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:

                        log.info(
                            f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss '
                            f'{train_loss / seen_sentences:.8f}')

                        iteration = epoch * len(batches) + batch_no

                        if not param_selection_mode:

                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= len(train_data)

                self.model.eval()

                log_line(log)

                log.info(
                    f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}'
                )

                dev_metric = None

                dev_loss = '_'

                train_metric = None

                test_metric = None

                if monitor_train:

                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not train_with_dev:

                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size)

                if not param_selection_mode and self.corpus.test:

                    test_metric, test_loss = self._calculate_evaluation_results_for(
                        'TEST', self.corpus.test, evaluation_metric,
                        embeddings_in_memory, eval_mini_batch_size,
                        base_path / 'test.tsv')

                if not param_selection_mode:

                    with open(loss_txt, 'a') as f:

                        train_metric_str = train_metric.to_tsv(
                        ) if train_metric is not None else Metric.to_empty_tsv(
                        )

                        dev_metric_str = dev_metric.to_tsv(
                        ) if dev_metric is not None else Metric.to_empty_tsv()

                        test_metric_str = test_metric.to_tsv(
                        ) if test_metric is not None else Metric.to_empty_tsv(
                        )

                        f.write(
                            f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t'
                            f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n'
                        )

                # calculate scores using dev data if available

                dev_score = 0.

                if not train_with_dev:

                    if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:

                        dev_score = dev_metric.macro_avg_accuracy()

                    elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:

                        dev_score = dev_metric.micro_avg_accuracy()

                    elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:

                        dev_score = dev_metric.macro_avg_f_score()

                    else:

                        dev_score = dev_metric.micro_avg_f_score()

                    # append dev score to score history

                    dev_score_history.append(dev_score)

                    dev_loss_history.append(dev_loss.item())

                # anneal against train loss if training with dev, otherwise anneal against dev score

                current_score = train_loss if anneal_against_train_loss else dev_score

                scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # if checkpointing is enabled, save model at each epoch

                if checkpoint and not param_selection_mode:

                    self.model.save_checkpoint(base_path / 'checkpoint.pt',
                                               optimizer.state_dict(),
                                               scheduler.state_dict(),
                                               epoch + 1, train_loss)

                # if we use dev data, remember best model based on dev evaluation score

                if not train_with_dev and not param_selection_mode and current_score == scheduler.best:

                    self.model.save(base_path / 'best-model.pt')

            # if we do not use dev data for model selection, save final model

            if save_final_model and not param_selection_mode:

                self.model.save(base_path / 'final-model.pt')

        except KeyboardInterrupt:

            log_line(log)

            log.info('Exiting from training early.')

            if not param_selection_mode:

                log.info('Saving model ...')

                self.model.save(base_path / 'final-model.pt')

                log.info('Done.')

        # test best model if test data is present

        if self.corpus.test:

            final_score = self.final_test(base_path, embeddings_in_memory,
                                          evaluation_metric,
                                          eval_mini_batch_size)

        else:

            final_score = 0

            log.info('Test data not provided; setting final score to 0')

        return {
            'test_score': final_score,
            'dev_score_history': dev_score_history,
            'train_loss_history': train_loss_history,
            'dev_loss_history': dev_loss_history
        }
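The dictionary returned by train() above can then be consumed as follows; the ModelTrainer construction and the tagger/corpus objects are assumptions, and only the returned keys come from the example:

# Hypothetical usage sketch; ModelTrainer(tagger, corpus) and the tagger/corpus
# objects are assumptions. Only the returned dictionary keys are taken from the
# train() method above.
trainer = ModelTrainer(tagger, corpus)
results = trainer.train('resources/taggers/example',
                        learning_rate=0.1,
                        mini_batch_size=32,
                        max_epochs=100,
                        checkpoint=True)
print(results['test_score'])
print(len(results['dev_score_history']), 'epochs evaluated on dev')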
Example No. 14
    def train(
        self,
        base_path: str,
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        max_epochs: int = 100,
        anneal_factor: float = 0.5,
        patience: int = 4,
        train_with_dev: bool = False,
        embeddings_in_memory: bool = True,
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
    ):

        evaluation_method = 'F1'
        if self.model.tag_type in ['pos', 'upos']:
            evaluation_method = 'accuracy'
        log.info('Evaluation method: {}'.format(evaluation_method))

        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
                .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                        Metric.tsv_header('TEST')))

        weight_extractor = WeightExtractor(base_path)

        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        # annealing scheduler
        anneal_mode = 'min' if train_with_dev else 'max'
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=anneal_factor,
                                      patience=patience,
                                      mode=anneal_mode,
                                      verbose=True)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:

            previous_learning_rate = learning_rate

            for epoch in range(0, max_epochs):

                bad_epochs = scheduler.num_bad_epochs
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts:
                    log.info('resetting to best model')
                    self.model.load_from_file(base_path + "/best-model.pt")

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.001:
                    log.info('learning rate too small - quitting training!')
                    break

                if not self.test_mode: random.shuffle(train_data)

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                self.model.train()

                current_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    batch: List[Sentence] = batch

                    optimizer.zero_grad()

                    # compute the loss for this batch; gradients and the optimizer step follow below
                    loss = self.model.neg_log_likelihood(batch)

                    current_loss += loss.item()
                    seen_sentences += len(batch)

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    if not embeddings_in_memory:
                        self.clear_embeddings_in_batch(batch)

                    if batch_no % modulo == 0:
                        log.info(
                            "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                                epoch + 1, batch_no, len(batches),
                                current_loss / seen_sentences))
                        iteration = epoch * len(batches) + batch_no
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

                current_loss /= len(train_data)

                # switch to eval mode
                self.model.eval()

                # if checkpointing is enabled, save model at each epoch
                if checkpoint:
                    self.model.save(base_path + "/checkpoint.pt")

                log.info('-' * 100)

                dev_score = dev_metric = None
                if not train_with_dev:
                    dev_score, dev_metric = self.evaluate(
                        self.corpus.dev,
                        base_path,
                        evaluation_method=evaluation_method,
                        embeddings_in_memory=embeddings_in_memory)

                test_score, test_metric = self.evaluate(
                    self.corpus.test,
                    base_path,
                    evaluation_method=evaluation_method,
                    embeddings_in_memory=embeddings_in_memory)

                # anneal against train loss if training with dev, otherwise anneal against dev score
                if train_with_dev:
                    scheduler.step(current_loss)
                else:
                    scheduler.step(dev_score)

                # logging info
                log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                    epoch + 1, learning_rate, bad_epochs))
                if not train_with_dev:
                    log.info(
                        "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                        .format('DEV', dev_metric.f_score(),
                                dev_metric.accuracy(), dev_metric._tp,
                                dev_metric._fp, dev_metric._fn,
                                dev_metric._tn))
                log.info(
                    "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                    .format('TEST', test_metric.f_score(),
                            test_metric.accuracy(), test_metric._tp,
                            test_metric._fp, test_metric._fn, test_metric._tn))

                with open(loss_txt, 'a') as f:
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        epoch, datetime.datetime.now(), '_',
                        Metric.to_empty_tsv(), '_', dev_metric_str, '_',
                        test_metric.to_tsv()))

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev and dev_score == scheduler.best:
                    self.model.save(base_path + "/best-model.pt")

            # if we do not use dev data for model selection, save final model
            if save_final_model:
                if train_with_dev:
                    self.model.save(base_path + "/final-model.pt")

        except KeyboardInterrupt:
            log.info('-' * 100)
            log.info('Exiting from training early.')
            log.info('Saving model ...')
            self.model.save(base_path + "/final-model.pt")
            log.info('Done.')
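As a self-contained sketch of the annealing behaviour the training loop above relies on (assuming only PyTorch; the toy model, scores and hyperparameters are illustrative), ReduceLROnPlateau is driven by the dev score in 'max' mode when dev data is held out, and by the train loss in 'min' mode when train_with_dev=True:

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(10, 2)                       # stand-in for self.model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

train_with_dev = False
anneal_mode = 'min' if train_with_dev else 'max'     # same rule as in train() above
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=4, mode=anneal_mode)

# fake per-epoch dev scores, for illustration only: the learning rate is halved
# once the score has failed to improve for patience + 1 consecutive epochs
for dev_score in [0.70, 0.75, 0.75, 0.74, 0.74, 0.74, 0.74, 0.74]:
    scheduler.step(dev_score)
    print('bad epochs:', scheduler.num_bad_epochs,
          '- lr:', optimizer.param_groups[0]['lr'])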
Example No. 15
def test_metric_get_classes():
    metric = Metric('Test')

    metric.add_fn('class-1')
    metric.add_fn('class-3')
    metric.add_tn('class-1')
    metric.add_tp('class-2')

    assert len(metric.get_classes()) == 3
    assert 'class-1' in metric.get_classes()
    assert 'class-2' in metric.get_classes()
    assert 'class-3' in metric.get_classes()
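In the same spirit, a small additional test sketch (not part of the original test file, and assuming the same Metric class imported above) of how the counters feed precision and recall; the counts are illustrative.

def test_metric_precision_recall_sketch():
    metric = Metric('Test')

    # class-1: two predictions, one of them correct, and one gold mention missed
    metric.add_tp('class-1')
    metric.add_fp('class-1')
    metric.add_fn('class-1')

    # precision = tp / (tp + fp), recall = tp / (tp + fn)
    assert metric.precision('class-1') == 0.5
    assert metric.recall('class-1') == 0.5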
Example No. 16
    def evaluate(
        self,
        data_loader: DataLoader,
        out_path: Path = None,
        embedding_storage_mode: str = "none",
    ) -> (Result, float):

        if isinstance(out_path, str):
            out_path = Path(out_path)

        with torch.no_grad():
            eval_loss = 0

            batch_no: int = 0

            metric = Metric("Evaluation", beta=self.beta)

            lines: List[str] = []

            if self.use_crf:
                transitions = self.transitions.detach().cpu().numpy()
            else:
                transitions = None

            for batch in data_loader:
                batch_no += 1

                with torch.no_grad():
                    features = self.forward(batch)
                    loss = self._calculate_loss(features, batch)
                    tags, _ = self._obtain_labels(
                        feature=features,
                        batch_sentences=batch,
                        transitions=transitions,
                        get_all_tags=False,
                    )

                eval_loss += loss

                for (sentence, sent_tags) in zip(batch, tags):
                    for (token, tag) in zip(sentence.tokens, sent_tags):
                        token: Token = token
                        token.add_tag("predicted", tag.value, tag.score)

                        # append both to file for evaluation
                        eval_line = "{} {} {} {}\n".format(
                            token.text,
                            token.get_tag(self.tag_type).value,
                            tag.value,
                            tag.score,
                        )
                        lines.append(eval_line)
                    lines.append("\n")
                for sentence in batch:
                    # make list of gold tags
                    gold_tags = [(tag.tag, str(tag))
                                 for tag in sentence.get_spans(self.tag_type)]
                    # make list of predicted tags
                    predicted_tags = [
                        (tag.tag, str(tag))
                        for tag in sentence.get_spans("predicted")
                    ]

                    # check for true positives, false positives and false negatives
                    for tag, prediction in predicted_tags:
                        if (tag, prediction) in gold_tags:
                            metric.add_tp(tag)
                        else:
                            metric.add_fp(tag)

                    for tag, gold in gold_tags:
                        if (tag, gold) not in predicted_tags:
                            metric.add_fn(tag)
                        else:
                            metric.add_tn(tag)

                store_embeddings(batch, embedding_storage_mode)

            eval_loss /= batch_no

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            return result, eval_loss
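To make the span bookkeeping in the loop above easier to follow, here is a stripped-down sketch that uses plain (tag, span-string) tuples instead of flair Span objects; the sentence and tags are invented for illustration.

# gold and predicted (tag, span-identity) pairs for one toy sentence
gold_tags = [('PER', 'PER [1,2]: "John Smith"'), ('LOC', 'LOC [5]: "Berlin"')]
predicted_tags = [('PER', 'PER [1,2]: "John Smith"'), ('ORG', 'ORG [5]: "Berlin"')]

tp, fp, fn = [], [], []
for pair in predicted_tags:                 # exact (tag, span) match counts as a TP
    (tp if pair in gold_tags else fp).append(pair[0])
for pair in gold_tags:                      # gold spans never predicted are FNs
    if pair not in predicted_tags:
        fn.append(pair[0])

print('tp:', tp)   # ['PER']
print('fp:', fp)   # ['ORG']  - span found but with the wrong tag
print('fn:', fn)   # ['LOC']  - gold span missed under its gold tag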
Example No. 17
    def evaluate(
        self,
        sentences: List[Sentence],
        eval_mini_batch_size: int = 32,
        embeddings_in_memory: bool = False,
        out_path: Path = None,
    ) -> (Result, float):

        with torch.no_grad():
            eval_loss = 0

            batches = [
                sentences[x:x + eval_mini_batch_size]
                for x in range(0, len(sentences), eval_mini_batch_size)
            ]

            metric = Metric("Evaluation")

            lines: List[str] = []
            for batch in batches:

                labels, loss = self.forward_labels_and_loss(batch)

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                eval_loss += loss

                sentences_for_batch = [
                    sent.to_plain_string() for sent in batch
                ]
                confidences_for_batch = [[
                    label.score for label in sent_labels
                ] for sent_labels in labels]
                predictions_for_batch = [[
                    label.value for label in sent_labels
                ] for sent_labels in labels]
                true_values_for_batch = [
                    sentence.get_label_names() for sentence in batch
                ]
                available_labels = self.label_dictionary.get_items()

                for sentence, confidence, prediction, true_value in zip(
                        sentences_for_batch,
                        confidences_for_batch,
                        predictions_for_batch,
                        true_values_for_batch,
                ):
                    eval_line = "{}\t{}\t{}\t{}\n".format(
                        sentence, true_value, prediction, confidence)
                    lines.append(eval_line)

                for predictions_for_sentence, true_values_for_sentence in zip(
                        predictions_for_batch, true_values_for_batch):

                    for label in available_labels:
                        if (label in predictions_for_sentence
                                and label in true_values_for_sentence):
                            metric.add_tp(label)
                        elif (label in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_fp(label)
                        elif (label not in predictions_for_sentence
                              and label in true_values_for_sentence):
                            metric.add_fn(label)
                        elif (label not in predictions_for_sentence
                              and label not in true_values_for_sentence):
                            metric.add_tn(label)

            eval_loss /= len(sentences)

            detailed_result = (
                f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
                f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
            )
            for class_name in metric.get_classes():
                detailed_result += (
                    f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                    f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                    f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                    f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                    f"{metric.f_score(class_name):.4f}")

            result = Result(
                main_score=metric.micro_avg_f_score(),
                log_line=
                f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
                log_header="PRECISION\tRECALL\tF1",
                detailed_results=detailed_result,
            )

            if out_path is not None:
                with open(out_path, "w", encoding="utf-8") as outfile:
                    outfile.write("".join(lines))

            return result, eval_loss
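Finally, a compact restatement (illustrative only, not taken from the original) of the per-label confusion logic in the loop above, using set membership instead of four separate elif branches; the label names and values are invented.

available_labels = ['sports', 'politics', 'tech']
predicted_for_sentence = {'sports', 'tech'}
true_for_sentence = {'sports', 'politics'}

counts = {'tp': [], 'fp': [], 'fn': [], 'tn': []}
for label in available_labels:
    in_pred = label in predicted_for_sentence
    in_true = label in true_for_sentence
    if in_pred and in_true:
        counts['tp'].append(label)
    elif in_pred:
        counts['fp'].append(label)
    elif in_true:
        counts['fn'].append(label)
    else:
        counts['tn'].append(label)

print(counts)   # {'tp': ['sports'], 'fp': ['tech'], 'fn': ['politics'], 'tn': []}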