예제 #1
0
파일: wrapper.py 프로젝트: mhabsaoui/delft
    def eval_nfold(self, x_test, y_test):
        """Score every fold model on one test set and retain the best one.

        For each of the ``self.model_config.fold_number`` folds, a
        non-shuffled ``DataGenerator`` is built over the test data and handed
        to a ``Scorer`` bound to ``self.models[fold]``. The f1, precision and
        recall read back from the scorer are averaged over the folds and
        printed, followed by the reports of the worst and of the best fold.
        The model with the highest f1 is stored in ``self.model``.

        When ``self.models`` is None the method does nothing.

        Args:
            x_test: test inputs, forwarded unchanged to ``DataGenerator``.
            y_test: expected labels, forwarded unchanged to ``DataGenerator``.
        """
        if self.models is None:
            return

        fold_reports = []
        f1_sum = 0
        precision_sum = 0
        recall_sum = 0
        # Comparisons against these seeds are strict, so ties keep the
        # earliest fold and fold 0 is the fallback for both extremes.
        top_f1, top_fold = 0, 0
        low_f1, low_fold = 1, 0

        for fold in range(0, self.model_config.fold_number):
            print('\n------------------------ fold ' + str(fold) +
                  ' --------------------------------------')

            # Prepare test data(steps, generator); the generator is rebuilt
            # for every fold.
            generator_options = dict(
                batch_size=self.training_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
            )
            test_generator = DataGenerator(x_test, y_test,
                                           **generator_options)

            # Build the evaluator and evaluate this fold's model
            evaluator = Scorer(test_generator, self.p, evaluation=True)
            evaluator.model = self.models[fold]
            evaluator.on_epoch_end(epoch=-1)
            fold_f1 = evaluator.f1
            fold_precision = evaluator.precision
            fold_recall = evaluator.recall
            fold_reports.append(evaluator.report)

            if fold_f1 > top_f1:
                top_f1, top_fold = fold_f1, fold
            if fold_f1 < low_f1:
                low_f1, low_fold = fold_f1, fold

            f1_sum += fold_f1
            precision_sum += fold_precision
            recall_sum += fold_recall

        fold_count = self.model_config.fold_number
        macro_f1 = f1_sum / fold_count
        macro_precision = precision_sum / fold_count
        macro_recall = recall_sum / fold_count

        print("\naverage over", fold_count, "folds")
        print("\tmacro f1 =", macro_f1)
        print("\tmacro precision =", macro_precision)
        print("\tmacro recall =", macro_recall, "\n")

        print("\n** Worst ** model scores - \n")
        print(fold_reports[low_fold])

        # The best fold becomes the current model.
        self.model = self.models[top_fold]
        print("\n** Best ** model scores - \n")
        print(fold_reports[top_fold])
예제 #2
0
    def eval_single(self, x_test, y_test):
        """Evaluate the current model on a test set.

        A non-shuffled ``DataGenerator`` is built over the test data and
        passed to a ``Scorer`` bound to ``self.model``; the scorer's
        end-of-epoch hook is then invoked once with ``epoch=-1``.

        Args:
            x_test: test inputs, forwarded unchanged to ``DataGenerator``.
            y_test: expected labels, forwarded unchanged to ``DataGenerator``.

        Raises:
            OSError: if ``self.model`` is falsy (no model available).
        """
        if not self.model:
            raise OSError('Could not find a model.')

        # Prepare test data(steps, generator)
        generator_options = dict(
            batch_size=self.training_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=False,
        )
        test_generator = DataGenerator(x_test, y_test, **generator_options)

        # Build the evaluator and evaluate the model
        evaluator = Scorer(test_generator, self.p, evaluation=True)
        evaluator.model = self.model
        evaluator.on_epoch_end(epoch=-1)
예제 #3
0
    def eval_single(self, x_test, y_test, features=None):
        """Evaluate the current model on a test set.

        The path taken depends on whether ``self.model_config.model_type``
        contains ``'bert'`` (case-insensitive):

        * non-BERT: a non-shuffled ``DataGenerator`` feeds a ``Scorer`` bound
          to ``self.model`` and the scorer's end-of-epoch hook is invoked
          with ``epoch=-1``;
        * BERT: ``self.model.predict`` is called on ``x_test`` and a
          classification report of the predictions is printed.

        On the BERT path, entries of ``y_test`` whose length differs from the
        prediction are replaced in place by copies padded with ``"O"``.

        Args:
            x_test: test inputs.
            y_test: expected label sequences, one per test input.
            features: optional additional features, forwarded to
                ``DataGenerator`` (not used on the BERT path).

        Raises:
            OSError: on the non-BERT path, if ``self.model`` is falsy.
        """
        uses_bert = 'bert' in self.model_config.model_type.lower()

        if not uses_bert:
            if not self.model:
                raise OSError('Could not find a model.')

            # Prepare test data(steps, generator)
            generator_options = dict(
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
            )
            test_generator = DataGenerator(x_test, y_test,
                                           **generator_options)

            # Build the evaluator and evaluate the model
            evaluator = Scorer(test_generator, self.p, evaluation=True)
            evaluator.model = self.model
            evaluator.on_epoch_end(epoch=-1)
            return

        # BERT architecture model
        y_pred = self.model.predict(x_test, fold_id=-1)

        misaligned = 0
        for idx in range(len(y_test)):
            gap = len(y_pred[idx]) - len(y_test[idx])
            if gap == 0:
                continue
            misaligned += 1
            # BERT tokenizer appears to introduce some additional tokens
            # without ## prefix, but this is normally handled when predicting.
            # To be very conservative, the shorter sequence is padded with "O"
            # so the number of tokens always matches; it should never be
            # needed in practice.
            if gap > 0:
                y_test[idx] = y_test[idx] + ["O"] * gap
            else:
                y_pred[idx] = y_pred[idx] + ["O"] * (-gap)

        if misaligned > 0:
            print("number of alignment issues with test set:", misaligned)

        report, _ = classification_report(y_test, y_pred, digits=4)
        print(report)
예제 #4
0
    def eval_nfold(self, x_test, y_test, features=None):
        """Evaluate every fold model on the same test set and keep the best.

        For each fold, f1, precision and recall are obtained either from a
        ``Scorer`` fed by a non-shuffled ``DataGenerator`` (non-BERT
        architectures), or by reloading the fold's BERT model from
        ``data/models/sequenceLabelling/<model_name>`` and scoring its
        predictions directly. The scores are then averaged over the folds,
        globally and per label, and printed together with the reports of the
        worst and of the best fold. A label absent from a fold's report
        contributes zero to that label's averages.

        The best fold becomes the current model: for non-BERT architectures
        ``self.model`` is set to it; for BERT the best fold directory is
        merged into the model directory and every per-fold directory is
        removed.

        Nothing is done when ``self.models`` is None.

        Args:
            x_test: test inputs.
            y_test: expected label sequences, one per test input. This method
                does not modify it.
            features: optional additional features, forwarded to
                ``DataGenerator`` (not used on the BERT path).
        """
        if self.models is None:
            return

        total_f1 = 0
        total_precision = 0
        total_recall = 0
        # Comparisons against these seeds are strict, so ties keep the
        # earliest fold and fold 0 is the fallback for both extremes.
        best_f1, best_index = 0, 0
        worst_f1, worst_index = 1, 0
        reports = []
        reports_as_map = []

        for i in range(self.model_config.fold_number):
            print('\n------------------------ fold ' + str(i) +
                  ' --------------------------------------')

            if 'bert' not in self.model_config.model_type.lower():
                # Prepare test data(steps, generator)
                test_generator = DataGenerator(
                    x_test,
                    y_test,
                    batch_size=self.model_config.batch_size,
                    preprocessor=self.p,
                    char_embed_size=self.model_config.char_embedding_size,
                    max_sequence_length=self.model_config.max_sequence_length,
                    embeddings=self.embeddings,
                    shuffle=False,
                    features=features)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.models[i]
                scorer.on_epoch_end(epoch=-1)
                f1 = scorer.f1
                precision = scorer.precision
                recall = scorer.recall
                reports.append(scorer.report)
                reports_as_map.append(scorer.report_as_map)
            else:
                # BERT architecture model: configuration, preprocessor and
                # weights of the fold are reloaded from disk.
                dir_path = 'data/models/sequenceLabelling/'
                self.model_config = ModelConfig.load(
                    os.path.join(dir_path, self.model_config.model_name,
                                 self.config_file))
                self.p = WordPreprocessor.load(
                    os.path.join(dir_path, self.model_config.model_name,
                                 self.preprocessor_file))
                self.model = get_model(self.model_config,
                                       self.p,
                                       ntags=len(self.p.vocab_tag))
                self.model.load_model(i)

                y_pred = self.model.predict(x_test, fold_id=i)

                # Pad a per-fold shallow copy of the expected labels, so that
                # padding added for one fold neither leaks into the next fold
                # nor alters the caller's data.
                y_gold = list(y_test)
                nb_alignment_issues = 0
                for j in range(len(y_gold)):
                    # Compare sequence j with prediction j (both indexed by
                    # the sequence position, not by the fold number).
                    gap = len(y_pred[j]) - len(y_gold[j])
                    if gap == 0:
                        continue
                    nb_alignment_issues += 1
                    # BERT tokenizer appears to introduce some additional
                    # tokens without ## prefix, but this is normally handled
                    # when predicting. To be very conservative, the shorter
                    # sequence is padded with "O" so the number of tokens
                    # always matches; it should never be needed in practice.
                    if gap > 0:
                        y_gold[j] = y_gold[j] + ["O"] * gap
                    else:
                        y_pred[j] = y_pred[j] + ["O"] * (-gap)

                if nb_alignment_issues > 0:
                    print("number of alignment issues with test set:",
                          nb_alignment_issues)

                f1 = f1_score(y_gold, y_pred)
                precision = precision_score(y_gold, y_pred)
                recall = recall_score(y_gold, y_pred)

                print("\tf1: {:04.2f}".format(f1 * 100))
                print("\tprecision: {:04.2f}".format(precision * 100))
                print("\trecall: {:04.2f}".format(recall * 100))

                report, report_as_map = classification_report(y_gold,
                                                              y_pred,
                                                              digits=4)
                reports.append(report)
                reports_as_map.append(report_as_map)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += precision
            total_recall += recall

        # Read after the loop: on the BERT path the configuration object has
        # been reloaded from disk for each fold.
        nb_folds = self.model_config.fold_number

        fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

        # Overall scores averaged over the folds, stored under the 'micro'
        # key that is requested from get_report below.
        fold_average_evaluation['micro'] = {
            'f1': total_f1 / nb_folds,
            'precision': total_precision / nb_folds,
            'recall': total_recall / nb_folds
        }

        # field-level average over the n folds
        seen_labels = set()
        for label in sorted(self.p.vocab_tag):
            if label == 'O' or label == '<PAD>':
                continue
            # Strip the two-character position prefix so that all the tags
            # of one field are averaged once, under the field name.
            if label.startswith(("B-", "S-", "I-", "E-")):
                label = label[2:]
            if label in seen_labels:
                continue
            seen_labels.add(label)

            sum_p = 0
            sum_r = 0
            sum_f1 = 0
            sum_support = 0
            for j in range(nb_folds):
                fold_labels = reports_as_map[j]['labels']
                if label not in fold_labels:
                    continue
                label_scores = fold_labels[label]
                sum_p += label_scores["precision"]
                sum_r += label_scores["recall"]
                sum_f1 += label_scores["f1"]
                sum_support += label_scores["support"]

            fold_average_evaluation['labels'][label] = {
                'precision': sum_p / nb_folds,
                'recall': sum_r / nb_folds,
                # The support is reported as a whole number: the fold
                # average is rounded down.
                'support': math.floor(sum_support / nb_folds),
                'f1': sum_f1 / nb_folds
            }

        print(
            "----------------------------------------------------------------------"
        )
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])

        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        if 'bert' not in self.model_config.model_type.lower():
            self.model = self.models[best_index]
        else:
            # copy best BERT model fold_number
            new_model_dir = ('data/models/sequenceLabelling/' +
                             self.model_config.model_name)
            best_model_dir = new_model_dir + str(best_index)
            # update new_model_dir if it already exists, keep its existing config content
            merge_folders(best_model_dir, new_model_dir)
            # clean other fold directory
            for i in range(nb_folds):
                shutil.rmtree('data/models/sequenceLabelling/' +
                              self.model_config.model_name + str(i))

        print(
            "----------------------------------------------------------------------"
        )
        print("\nAverage over", nb_folds, "folds")
        print(
            get_report(fold_average_evaluation,
                       digits=4,
                       include_avgs=['micro']))
예제 #5
0
    def eval_nfold(self, x_test, y_test, features=None):
        """Evaluate every fold model on the same test set and keep the best.

        For each fold the model is taken from ``self.models`` or, when
        ``self.model_config.transformer_name`` is set, rebuilt and loaded
        from the fold's weight file under
        ``data/models/sequenceLabelling/<model_name>``. The model is scored
        through a ``Scorer`` fed by the model's own generator class. Scores
        are averaged over the folds, globally and per label, and printed
        together with the reports of the worst and of the best fold.

        Afterwards ``self.model_config.fold_number`` is set to 1 and the best
        fold becomes the current model (assigned from ``self.models``, or its
        weights reloaded into ``self.model`` for transformer architectures).

        Nothing is done when ``self.models`` is None.

        Args:
            x_test: test inputs, forwarded to the generator.
            y_test: expected labels, forwarded to the generator.
            features: optional additional features, forwarded to the
                generator.
        """
        if self.models is None:
            return

        fold_reports = []
        fold_report_maps = []
        f1_total = 0
        precision_total = 0
        recall_total = 0
        # Comparisons against these seeds are strict, so ties keep the
        # earliest fold and fold 0 is the fallback for both extremes.
        top_f1, top_fold = 0, 0
        low_f1, low_fold = 1, 0

        for fold in range(self.model_config.fold_number):

            if self.model_config.transformer_name is None:
                fold_model = self.models[fold]
                bert_preprocessor = None
            else:
                # the architecture model uses a transformer layer, it is large and needs to be loaded from disk
                dir_path = 'data/models/sequenceLabelling/'
                fold_suffix = str(fold) + ".hdf5"
                weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                    ".hdf5", fold_suffix)
                self.model = get_model(
                    self.model_config,
                    self.p,
                    ntags=len(self.p.vocab_tag),
                    load_pretrained_weights=False,
                    local_path=os.path.join(dir_path,
                                            self.model_config.model_name))
                weight_path = os.path.join(dir_path,
                                           self.model_config.model_name,
                                           weight_file)
                self.model.load(filepath=weight_path)
                fold_model = self.model
                bert_preprocessor = self.model.transformer_preprocessor

            # The model summary and the parameters are shown once only.
            if fold == 0:
                fold_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(fold) +
                  ' --------------------------------------')

            # we can use a data generator for evaluation
            # Prepare test data(steps, generator)
            generator_class = fold_model.get_generator()
            generator_options = dict(
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                bert_preprocessor=bert_preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf,
            )
            test_generator = generator_class(x_test, y_test,
                                             **generator_options)

            # Build the evaluator and evaluate the model
            evaluator = Scorer(test_generator,
                               self.p,
                               evaluation=True,
                               use_crf=self.model_config.use_crf,
                               use_chain_crf=self.model_config.use_chain_crf)
            evaluator.model = fold_model
            evaluator.on_epoch_end(epoch=-1)
            fold_f1 = evaluator.f1
            fold_precision = evaluator.precision
            fold_recall = evaluator.recall
            fold_reports.append(evaluator.report)
            fold_report_maps.append(evaluator.report_as_map)

            if fold_f1 > top_f1:
                top_f1, top_fold = fold_f1, fold
            if fold_f1 < low_f1:
                low_f1, low_fold = fold_f1, fold

            f1_total += fold_f1
            precision_total += fold_precision
            recall_total += fold_recall

        fold_count = self.model_config.fold_number

        # Overall scores averaged over the folds, stored under the 'micro'
        # key that is requested from get_report below.
        overall_block = {
            'f1': f1_total / fold_count,
            'precision': precision_total / fold_count,
            'recall': recall_total / fold_count
        }
        fold_average_evaluation = {
            'labels': {},
            'micro': overall_block,
            'macro': {}
        }

        # field-level average over the n folds
        handled_fields = []
        for tag in sorted(self.p.vocab_tag):
            if tag in ('O', '<PAD>'):
                continue
            # Strip the two-character position prefix so that all the tags
            # of one field are averaged once, under the field name.
            field = tag[2:] if tag.startswith(
                ("B-", "S-", "I-", "E-")) else tag
            if field in handled_fields:
                continue
            handled_fields.append(field)

            precision_acc = 0
            recall_acc = 0
            f1_acc = 0
            support_acc = 0
            for fold in range(0, fold_count):
                per_label = fold_report_maps[fold]['labels']
                if field not in per_label:
                    # A field missing from a fold adds nothing to the sums
                    # but the fold still counts in the divisor.
                    continue
                entry = per_label[field]
                precision_acc += entry["precision"]
                recall_acc += entry["recall"]
                f1_acc += entry["f1"]
                support_acc += entry["support"]

            mean_precision = precision_acc / fold_count
            mean_recall = recall_acc / fold_count
            mean_f1 = f1_acc / fold_count
            mean_support = support_acc / fold_count
            # NOTE(review): the sliced text of a float fractional part looks
            # like ".0" or ".5", which presumably never equals '0', so the
            # support appears to always be floored - confirm the intent.
            fraction_text = str(mean_support - int(mean_support))[1:]
            if fraction_text != '0':
                mean_support = math.floor(mean_support)

            fold_average_evaluation['labels'][field] = {
                'precision': mean_precision,
                'recall': mean_recall,
                'support': mean_support,
                'f1': mean_f1
            }

        separator = "----------------------------------------------------------------------"

        print(separator)
        print("\n** Worst ** model scores - run", str(low_fold))
        print(fold_reports[low_fold])

        print("\n** Best ** model scores - run", str(top_fold))
        print(fold_reports[top_fold])

        # From now on the wrapper holds a single model: remember the number
        # of folds for the final message, then reset it in the configuration.
        fold_nb = self.model_config.fold_number
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is None:
            self.model = self.models[top_fold]
        else:
            dir_path = 'data/models/sequenceLabelling/'
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                ".hdf5",
                str(top_fold) + ".hdf5")
            # saved config file must be updated to single fold
            self.model.load(filepath=os.path.join(
                dir_path, self.model_config.model_name, weight_file))

        print(separator)
        print("\nAverage over", str(int(fold_nb)), "folds")
        print(
            get_report(fold_average_evaluation,
                       digits=4,
                       include_avgs=['micro']))
예제 #6
0
    def eval_single(self, x_test, y_test, features=None):
        """Evaluate the current model on a test set.

        The parameters and the model summary are printed first. Then:

        * without a transformer (``self.model_config.transformer_name`` is
          None), the model's generator class feeds a ``Scorer`` bound to
          ``self.model`` and the scorer's end-of-epoch hook is invoked with
          ``epoch=-1``;
        * with a transformer, the test inputs are labelled by a ``Tagger``
          and a classification report of the predicted labels is printed.

        On the transformer path, entries of ``y_test`` whose length differs
        from the prediction are replaced in place by copies padded with
        ``"O"``.

        Args:
            x_test: test inputs.
            y_test: expected label sequences, one per test input.
            features: optional additional features, forwarded to the
                generator or to the tagger.

        Raises:
            OSError: if ``self.model`` is None.
        """
        if self.model is None:
            raise OSError('Could not find a model.')
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.transformer_name is None:
            # we can use a data generator for evaluation

            # Prepare test data(steps, generator)
            generator_class = self.model.get_generator()
            generator_options = dict(
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf,
            )
            test_generator = generator_class(x_test, y_test,
                                             **generator_options)

            # Build the evaluator and evaluate the model
            evaluator = Scorer(test_generator,
                               self.p,
                               evaluation=True,
                               use_crf=self.model_config.use_crf,
                               use_chain_crf=self.model_config.use_chain_crf)
            evaluator.model = self.model
            evaluator.on_epoch_end(epoch=-1)
            return

        # the architecture model uses a transformer layer
        # note that we could also use the above test_generator, but as an alternative here we check the
        # test/prediction alignment of tokens and the validity of the maximum sequence input length
        # wrt the length of the test sequences
        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        tagged_sequences = tagger.tag(x_test,
                                      output_format=None,
                                      features=features)

        # keep only labels (second item of every pair)
        y_pred = [[pair[1] for pair in sequence]
                  for sequence in tagged_sequences]

        misaligned = 0
        for idx in range(len(y_test)):
            gap = len(y_pred[idx]) - len(y_test[idx])
            if gap == 0:
                continue
            misaligned += 1
            # BERT tokenizer appears to introduce some additional tokens
            # without ## prefix, but we normally handled that well when
            # predicting. To be very conservative, the shorter sequence is
            # padded with "O" so the number of tokens always matches; it
            # should never be needed in practice.
            if gap > 0:
                y_test[idx] = y_test[idx] + ["O"] * gap
            else:
                y_pred[idx] = y_pred[idx] + ["O"] * (-gap)

        if misaligned > 0:
            print("number of alignment issues with test set:", misaligned)
            print(
                "to solve them consider increasing the maximum sequence input length of the model and retrain"
            )

        report, _ = classification_report(y_test, y_pred, digits=4)
        print(report)