def eval_nfold(self, x_test, y_test):
    """Score every fold model on the test set and keep the best one.

    For each fold a ``Scorer`` is run over a freshly built, non-shuffled
    ``DataGenerator`` wrapping ``x_test`` / ``y_test``. The f1, precision
    and recall reported by the scorer are averaged over
    ``self.model_config.fold_number`` and printed, followed by the reports
    of the worst and best folds. ``self.model`` is set to the model of the
    fold with the highest f1.

    Nothing is done when ``self.models`` is ``None``.
    """
    if self.models is None:
        return

    fold_reports = []
    fold_f1 = []
    fold_precision = []
    fold_recall = []
    # Strict comparisons against these starting thresholds mean that, on a
    # tie, the earliest fold is the one retained.
    top_score, top_fold = 0, 0
    low_score, low_fold = 1, 0

    for fold in range(0, self.model_config.fold_number):
        print('\n------------------------ fold ' + str(fold) +
              ' --------------------------------------')

        # Prepare test data(steps, generator)
        test_generator = DataGenerator(
            x_test,
            y_test,
            batch_size=self.training_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            shuffle=False)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.models[fold]
        scorer.on_epoch_end(epoch=-1)

        score = scorer.f1
        fold_f1.append(score)
        fold_precision.append(scorer.precision)
        fold_recall.append(scorer.recall)
        fold_reports.append(scorer.report)

        if score > top_score:
            top_score, top_fold = score, fold
        if score < low_score:
            low_score, low_fold = score, fold

    # Averages over the folds (printed under the "macro" label).
    macro_f1 = sum(fold_f1) / self.model_config.fold_number
    macro_precision = sum(fold_precision) / self.model_config.fold_number
    macro_recall = sum(fold_recall) / self.model_config.fold_number

    print("\naverage over", self.model_config.fold_number, "folds")
    print("\tmacro f1 =", macro_f1)
    print("\tmacro precision =", macro_precision)
    print("\tmacro recall =", macro_recall, "\n")

    print("\n** Worst ** model scores - \n")
    print(fold_reports[low_fold])

    # The best fold becomes the active single model.
    self.model = self.models[top_fold]
    print("\n** Best ** model scores - \n")
    print(fold_reports[top_fold])
def eval_single(self, x_test, y_test):
    """Evaluate the single loaded model on the given test set.

    A non-shuffled ``DataGenerator`` is built over ``x_test`` / ``y_test``
    and handed to a ``Scorer`` in evaluation mode, whose ``on_epoch_end``
    hook is then called with ``epoch=-1``.

    Raises:
        OSError: if ``self.model`` is falsy (no model available).
    """
    if not self.model:
        raise OSError('Could not find a model.')

    # Prepare test data(steps, generator)
    test_generator = DataGenerator(
        x_test,
        y_test,
        batch_size=self.training_config.batch_size,
        preprocessor=self.p,
        char_embed_size=self.model_config.char_embedding_size,
        embeddings=self.embeddings,
        shuffle=False)

    # Build the evaluator and evaluate the model
    evaluator = Scorer(test_generator, self.p, evaluation=True)
    evaluator.model = self.model
    evaluator.on_epoch_end(epoch=-1)
def eval_single(self, x_test, y_test, features=None):
    """Evaluate the single loaded model on the given test set.

    When ``self.model_config.model_type`` does not contain ``"bert"``
    (case-insensitive), a non-shuffled ``DataGenerator`` is built and a
    ``Scorer`` is run over it. Otherwise the model's own ``predict`` is
    called and the predictions are compared with the expected labels
    through ``classification_report``; the report is printed.

    Args:
        x_test: test inputs, passed through to the generator / ``predict``.
        y_test: expected label sequences. This argument is never modified;
            any padding needed for alignment is applied to a local copy.
        features: optional additional features, forwarded to the generator.

    Raises:
        OSError: if no model is available, in either branch.
    """
    if 'bert' not in self.model_config.model_type.lower():
        if not self.model:
            raise OSError('Could not find a model.')

        # Prepare test data(steps, generator)
        test_generator = DataGenerator(
            x_test,
            y_test,
            batch_size=self.model_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            shuffle=False,
            features=features)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)
        return

    # BERT architecture model
    if self.model is None:
        # Report a missing model the same way as the non-BERT branch rather
        # than failing with an AttributeError on ``None.predict``.
        raise OSError('Could not find a model.')

    y_pred = self.model.predict(x_test, fold_id=-1)

    # Shallow copy: padded sequences are rebound on the copy only, so the
    # caller's expected labels are left untouched.
    y_true = list(y_test)

    nb_alignment_issues = 0
    for i in range(len(y_true)):
        len_true = len(y_true[i])
        len_pred = len(y_pred[i])
        if len_true == len_pred:
            continue
        nb_alignment_issues += 1
        # BERT tokenizer appears to introduce some additional tokens without ## prefix,
        # but this is normally handled when predicting.
        # To be very conservative, the following ensure the number of tokens always
        # match, but it should never be used in practice.
        if len_true < len_pred:
            y_true[i] = y_true[i] + ["O"] * (len_pred - len_true)
        else:
            y_pred[i] = y_pred[i] + ["O"] * (len_true - len_pred)

    if nb_alignment_issues > 0:
        print("number of alignment issues with test set:", nb_alignment_issues)

    report, report_as_map = classification_report(y_true, y_pred, digits=4)
    print(report)
def eval_nfold(self, x_test, y_test, features=None):
    """Evaluate every fold model on the test set and report fold averages.

    For each fold, scores are obtained either from a ``Scorer`` run over a
    ``DataGenerator`` (model type without ``"bert"``) or, for BERT models,
    by reloading the configuration, preprocessor and fold model from
    ``data/models/sequenceLabelling/`` and scoring its predictions. The
    worst and best fold reports are printed, then the per-label and overall
    scores averaged over the folds.

    Side effects visible in this method:
      * non-BERT: ``self.model`` is set to the best fold model;
      * BERT: ``self.model_config``, ``self.p`` and ``self.model`` are
        reloaded for each fold, the best fold directory is merged into the
        model directory and every per-fold directory is removed.

    Args:
        x_test: test inputs.
        y_test: expected label sequences. Never modified; alignment padding
            is applied to a per-fold copy so that one fold cannot affect the
            reference labels seen by the next.
        features: optional additional features, forwarded to the generator.

    Nothing is done when ``self.models`` is ``None``.
    """
    if self.models is None:
        return

    total_f1 = 0
    total_precision = 0
    total_recall = 0
    best_f1 = 0
    best_index = 0
    worst_f1 = 1
    worst_index = 0
    reports = []
    reports_as_map = []

    for i in range(self.model_config.fold_number):
        print('\n------------------------ fold ' + str(i) +
              ' --------------------------------------')

        if 'bert' not in self.model_config.model_type.lower():
            # Prepare test data(steps, generator)
            test_generator = DataGenerator(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.models[i]
            scorer.on_epoch_end(epoch=-1)
            f1 = scorer.f1
            precision = scorer.precision
            recall = scorer.recall
            reports.append(scorer.report)
            reports_as_map.append(scorer.report_as_map)
        else:
            # BERT architecture model
            dir_path = 'data/models/sequenceLabelling/'
            self.model_config = ModelConfig.load(
                os.path.join(dir_path, self.model_config.model_name,
                             self.config_file))
            self.p = WordPreprocessor.load(
                os.path.join(dir_path, self.model_config.model_name,
                             self.preprocessor_file))
            self.model = get_model(self.model_config, self.p,
                                   ntags=len(self.p.vocab_tag))
            self.model.load_model(i)

            y_pred = self.model.predict(x_test, fold_id=i)

            # Per-fold shallow copy of the expected labels: padding applied
            # for this fold must not leak into the next fold or the caller.
            y_true = list(y_test)

            nb_alignment_issues = 0
            for j in range(len(y_true)):
                # Compare sequence j with prediction j (the fold index must
                # not be used to index the test sequences).
                len_true = len(y_true[j])
                len_pred = len(y_pred[j])
                if len_true == len_pred:
                    continue
                nb_alignment_issues += 1
                # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                # but this is normally handled when predicting.
                # To be very conservative, the following ensure the number of tokens always
                # match, but it should never be used in practice.
                if len_true < len_pred:
                    y_true[j] = y_true[j] + ["O"] * (len_pred - len_true)
                else:
                    y_pred[j] = y_pred[j] + ["O"] * (len_true - len_pred)

            if nb_alignment_issues > 0:
                print("number of alignment issues with test set:",
                      nb_alignment_issues)

            f1 = f1_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)

            print("\tf1: {:04.2f}".format(f1 * 100))
            print("\tprecision: {:04.2f}".format(precision * 100))
            print("\trecall: {:04.2f}".format(recall * 100))

            report, report_as_map = classification_report(y_true, y_pred,
                                                          digits=4)
            reports.append(report)
            reports_as_map.append(report_as_map)

        # Strict comparisons: on a tie the earliest fold is retained.
        if best_f1 < f1:
            best_f1 = f1
            best_index = i
        if worst_f1 > f1:
            worst_f1 = f1
            worst_index = i
        total_f1 += f1
        total_precision += precision
        total_recall += recall

    fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

    micro_f1 = total_f1 / self.model_config.fold_number
    micro_precision = total_precision / self.model_config.fold_number
    micro_recall = total_recall / self.model_config.fold_number

    micro_eval_block = {
        'f1': micro_f1,
        'precision': micro_precision,
        'recall': micro_recall
    }
    fold_average_evaluation['micro'] = micro_eval_block

    # field-level average over the n folds
    seen_labels = set()
    for label in sorted(self.p.vocab_tag):
        if label == 'O' or label == '<PAD>':
            continue
        # Strip the tagging-scheme prefix so each field is reported once.
        if label.startswith(("B-", "S-", "I-", "E-")):
            label = label[2:]
        if label in seen_labels:
            continue
        seen_labels.add(label)

        sum_p = 0
        sum_r = 0
        sum_f1 = 0
        sum_support = 0
        for j in range(0, self.model_config.fold_number):
            if label not in reports_as_map[j]['labels']:
                continue
            report_as_map = reports_as_map[j]['labels'][label]
            sum_p += report_as_map["precision"]
            sum_r += report_as_map["recall"]
            sum_f1 += report_as_map["f1"]
            sum_support += report_as_map["support"]

        # Folds lacking the label still count in the denominator.
        avg_p = sum_p / self.model_config.fold_number
        avg_r = sum_r / self.model_config.fold_number
        avg_f1 = sum_f1 / self.model_config.fold_number
        # The former fractional-part string test was true for any float, so
        # the average support was always floored; do so directly.
        avg_support = math.floor(sum_support / self.model_config.fold_number)

        block_label = {
            'precision': avg_p,
            'recall': avg_r,
            'support': avg_support,
            'f1': avg_f1
        }
        fold_average_evaluation['labels'][label] = block_label

    print("----------------------------------------------------------------------")
    print("\n** Worst ** model scores - run", str(worst_index))
    print(reports[worst_index])

    print("\n** Best ** model scores - run", str(best_index))
    print(reports[best_index])

    if 'bert' not in self.model_config.model_type.lower():
        self.model = self.models[best_index]
    else:
        # copy best BERT model fold_number
        best_model_dir = ('data/models/sequenceLabelling/' +
                          self.model_config.model_name + str(best_index))
        new_model_dir = ('data/models/sequenceLabelling/' +
                         self.model_config.model_name)
        # update new_model_dir if it already exists, keep its existing config content
        merge_folders(best_model_dir, new_model_dir)
        # clean other fold directory
        for i in range(self.model_config.fold_number):
            shutil.rmtree('data/models/sequenceLabelling/' +
                          self.model_config.model_name + str(i))

    print("----------------------------------------------------------------------")
    print("\nAverage over", self.model_config.fold_number, "folds")
    print(get_report(fold_average_evaluation, digits=4,
                     include_avgs=['micro']))
def eval_nfold(self, x_test, y_test, features=None):
    """Evaluate every fold model on the test set and report fold averages.

    Each fold model is scored with a ``Scorer`` over a generator obtained
    from the model itself. When ``self.model_config.transformer_name`` is
    set, the fold model is rebuilt and its weights are loaded from
    ``data/models/sequenceLabelling/<model_name>/`` instead of being taken
    from ``self.models``. The worst and best fold reports are printed, then
    the per-label and overall scores averaged over the folds.

    Side effects visible in this method: ``self.model`` ends up holding the
    best fold model (or has the best fold weights loaded into it), and
    ``self.model_config.fold_number`` is set to 1.

    Nothing is done when ``self.models`` is ``None``.
    """
    if self.models is None:
        return

    fold_reports = []
    fold_report_maps = []
    f1_values = []
    precision_values = []
    recall_values = []
    # Strict comparisons against these starting thresholds mean that, on a
    # tie, the earliest fold is the one retained.
    top_f1, top_fold = 0, 0
    low_f1, low_fold = 1, 0

    for fold in range(self.model_config.fold_number):
        if self.model_config.transformer_name is not None:
            # the architecture model uses a transformer layer, it is large and needs to be loaded from disk
            dir_path = 'data/models/sequenceLabelling/'
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                ".hdf5", str(fold) + ".hdf5")
            self.model = get_model(
                self.model_config,
                self.p,
                ntags=len(self.p.vocab_tag),
                load_pretrained_weights=False,
                local_path=os.path.join(dir_path,
                                        self.model_config.model_name))
            self.model.load(filepath=os.path.join(
                dir_path, self.model_config.model_name, weight_file))
            the_model = self.model
            bert_preprocessor = self.model.transformer_preprocessor
        else:
            the_model = self.models[fold]
            bert_preprocessor = None

        if fold == 0:
            the_model.print_summary()
            print_parameters(self.model_config, self.training_config)

        print('\n------------------------ fold ' + str(fold) +
              ' --------------------------------------')

        # we can use a data generator for evaluation
        # Prepare test data(steps, generator)
        generator_cls = the_model.get_generator()
        test_generator = generator_cls(
            x_test,
            y_test,
            batch_size=self.model_config.batch_size,
            preprocessor=self.p,
            bert_preprocessor=bert_preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            shuffle=False,
            features=features,
            output_input_offsets=True,
            use_chain_crf=self.model_config.use_chain_crf)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator,
                        self.p,
                        evaluation=True,
                        use_crf=self.model_config.use_crf,
                        use_chain_crf=self.model_config.use_chain_crf)
        scorer.model = the_model
        scorer.on_epoch_end(epoch=-1)

        fold_f1 = scorer.f1
        f1_values.append(fold_f1)
        precision_values.append(scorer.precision)
        recall_values.append(scorer.recall)
        fold_reports.append(scorer.report)
        fold_report_maps.append(scorer.report_as_map)

        if fold_f1 > top_f1:
            top_f1, top_fold = fold_f1, fold
        if fold_f1 < low_f1:
            low_f1, low_fold = fold_f1, fold

    # Still the original fold count here; it is reset to 1 further down.
    fold_nb = self.model_config.fold_number

    fold_average_evaluation = {
        'labels': {},
        'micro': {
            'f1': sum(f1_values) / fold_nb,
            'precision': sum(precision_values) / fold_nb,
            'recall': sum(recall_values) / fold_nb
        },
        'macro': {}
    }

    # field-level average over the n folds
    handled = []
    for tag in sorted(self.p.vocab_tag):
        if tag in ('O', '<PAD>'):
            continue
        # Strip the tagging-scheme prefix so each field is reported once.
        field = tag[2:] if tag.startswith(("B-", "S-", "I-", "E-")) else tag
        if field in handled:
            continue
        handled.append(field)

        # Scores of this field in the folds whose report mentions it.
        entries = [
            fold_report_maps[k]['labels'][field]
            for k in range(0, fold_nb)
            if field in fold_report_maps[k]['labels']
        ]

        # Folds lacking the field still count in the denominator.
        mean_p = sum(entry["precision"] for entry in entries) / fold_nb
        mean_r = sum(entry["recall"] for entry in entries) / fold_nb
        mean_f1 = sum(entry["f1"] for entry in entries) / fold_nb
        mean_support = sum(entry["support"] for entry in entries) / fold_nb

        # NOTE(review): for a float value this string comparison looks like
        # it is always true, i.e. the support is always floored - confirm.
        fractional_part = str(mean_support - int(mean_support))[1:]
        if fractional_part != '0':
            mean_support = math.floor(mean_support)

        fold_average_evaluation['labels'][field] = {
            'precision': mean_p,
            'recall': mean_r,
            'support': mean_support,
            'f1': mean_f1
        }

    print("----------------------------------------------------------------------")
    print("\n** Worst ** model scores - run", str(low_fold))
    print(fold_reports[low_fold])

    print("\n** Best ** model scores - run", str(top_fold))
    print(fold_reports[top_fold])

    self.model_config.fold_number = 1
    if self.model_config.transformer_name is not None:
        dir_path = 'data/models/sequenceLabelling/'
        weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
            ".hdf5", str(top_fold) + ".hdf5")
        # saved config file must be updated to single fold
        self.model.load(filepath=os.path.join(
            dir_path, self.model_config.model_name, weight_file))
    else:
        self.model = self.models[top_fold]

    print("----------------------------------------------------------------------")
    print("\nAverage over", str(int(fold_nb)), "folds")
    print(get_report(fold_average_evaluation, digits=4,
                     include_avgs=['micro']))
def eval_single(self, x_test, y_test, features=None):
    """Evaluate the single loaded model on the given test set.

    The model parameters and summary are printed first. Without a
    transformer (``self.model_config.transformer_name is None``) a
    ``Scorer`` is run over a non-shuffled generator obtained from the
    model. With a transformer, a ``Tagger`` labels ``x_test`` and the
    predicted labels are compared with the expected ones through
    ``classification_report``, after checking token alignment; the report
    is printed.

    Args:
        x_test: test inputs.
        y_test: expected label sequences. This argument is never modified;
            any padding needed for alignment is applied to a local copy.
        features: optional additional features, forwarded to the generator
            or the tagger.

    Raises:
        OSError: if ``self.model`` is ``None``.
    """
    if self.model is None:
        raise OSError('Could not find a model.')

    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    if self.model_config.transformer_name is None:
        # we can use a data generator for evaluation
        # Prepare test data(steps, generator)
        generator = self.model.get_generator()
        test_generator = generator(
            x_test,
            y_test,
            batch_size=self.model_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            shuffle=False,
            features=features,
            output_input_offsets=True,
            use_chain_crf=self.model_config.use_chain_crf)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator,
                        self.p,
                        evaluation=True,
                        use_crf=self.model_config.use_crf,
                        use_chain_crf=self.model_config.use_chain_crf)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)
        return

    # the architecture model uses a transformer layer
    # note that we could also use the above test_generator, but as an alternative here we check the
    # test/prediction alignment of tokens and the validity of the maximum sequence input length
    # wrt the length of the test sequences
    tagger = Tagger(
        self.model,
        self.model_config,
        self.embeddings,
        preprocessor=self.p,
        transformer_preprocessor=self.model.transformer_preprocessor)
    y_pred_pairs = tagger.tag(x_test, output_format=None, features=features)

    # keep only labels (second item of each pair)
    y_pred = [[pair[1] for pair in result] for result in y_pred_pairs]

    # Shallow copy: padded sequences are rebound on the copy only, so the
    # caller's expected labels are left untouched.
    y_true = list(y_test)

    nb_alignment_issues = 0
    for i in range(len(y_true)):
        len_true = len(y_true[i])
        len_pred = len(y_pred[i])
        if len_true == len_pred:
            continue
        nb_alignment_issues += 1
        # BERT tokenizer appears to introduce some additional tokens without ## prefix,
        # but we normally handled that well when predicting.
        # To be very conservative, the following ensure the number of tokens always
        # match, but it should never be used in practice.
        if len_true < len_pred:
            y_true[i] = y_true[i] + ["O"] * (len_pred - len_true)
        else:
            y_pred[i] = y_pred[i] + ["O"] * (len_true - len_pred)

    if nb_alignment_issues > 0:
        print("number of alignment issues with test set:", nb_alignment_issues)
        print(
            "to solve them consider increasing the maximum sequence input length of the model and retrain"
        )

    report, report_as_map = classification_report(y_true, y_pred, digits=4)
    print(report)