def load(self, dir_path='data/models/sequenceLabelling/'):
    """Restore a saved sequence labelling model from ``dir_path``.

    The model configuration and the preprocessor are read from the
    sub-directory named after the current model name.  Model types whose
    name contains "bert" (case-insensitive) are rebuilt and restored through
    ``load_model()``; every other model type gets its embeddings
    re-instantiated and its weights read from ``self.weight_file``.

    Args:
        dir_path: root directory holding one sub-directory per model.
    """
    config_path = os.path.join(
        dir_path, self.model_config.model_name, self.config_file)
    self.model_config = ModelConfig.load(config_path)

    # The preprocessor path is built from the freshly loaded configuration.
    preprocessor_path = os.path.join(
        dir_path, self.model_config.model_name, self.preprocessor_file)
    self.p = WordPreprocessor.load(preprocessor_path)

    if "bert" in self.model_config.model_type.lower():
        # BERT-style models take care of their own weights via load_model().
        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag),
                               dir_path=dir_path)
        self.model.load_model()
        return

    # load embeddings
    # Do not use cache in 'production' mode
    config = self.model_config
    self.embeddings = Embeddings(config.embeddings_name,
                                 use_ELMo=config.use_ELMo,
                                 use_BERT=config.use_BERT,
                                 use_cache=False)
    config.word_embedding_size = self.embeddings.embed_size

    self.model = get_model(config, self.p, ntags=len(self.p.vocab_tag))
    weights_path = os.path.join(dir_path, config.model_name, self.weight_file)
    self.model.load(filepath=weights_path)
def load(self, dir_path='data/models/sequenceLabelling/', weight_file=DEFAULT_WEIGHT_FILE_NAME):
    """Restore configuration, embeddings, preprocessor and weights of a model.

    Args:
        dir_path: root directory holding one sub-directory per model.
        weight_file: name of the weight file to read inside the model
            directory.
    """
    # The configuration is located with the model name known before loading.
    initial_dir = os.path.join(dir_path, self.model_config.model_name)
    self.model_config = ModelConfig.load(
        os.path.join(initial_dir, CONFIG_FILE_NAME))

    if self.model_config.embeddings_name is None:
        # No static embeddings are attached to this model.
        self.embeddings = None
        self.model_config.word_embedding_size = 0
    else:
        # load embeddings
        # Do not use cache in 'prediction/production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     resource_registry=self.registry,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

    # Everything below is located with the model name of the loaded config.
    self.p = Preprocessor.load(
        os.path.join(dir_path, self.model_config.model_name,
                     PROCESSOR_FILE_NAME))

    self.model = get_model(
        self.model_config,
        self.p,
        ntags=len(self.p.vocab_tag),
        load_pretrained_weights=False,
        local_path=os.path.join(dir_path, self.model_config.model_name))

    weights_path = os.path.join(
        dir_path, self.model_config.model_name, weight_file)
    print("load weights from", weights_path)
    self.model.load(filepath=weights_path)
    self.model.print_summary()
def load(self, dir_path='data/models/textClassification/'):
    """Restore a saved text classification model (single or n-fold).

    The configuration is read from ``<dir_path>/<model_name>/`` and the
    embeddings are re-instantiated from it.  With a single fold, the weights
    are loaded into ``self.model``; otherwise one model per fold is built,
    loaded from ``<model_type>.model<i>_weights.hdf5`` and collected in
    ``self.models``.

    Args:
        dir_path: root directory holding one sub-directory per model.
    """
    self.model_config = ModelConfig.load(
        os.path.join(dir_path, self.model_config.model_name,
                     self.config_file))

    # load embeddings
    self.embeddings = Embeddings(self.model_config.embeddings_name,
                                 use_ELMo=self.model_config.use_ELMo,
                                 use_BERT=self.model_config.use_BERT)
    self.model_config.word_embedding_size = self.embeddings.embed_size

    self.model = getModel(self.model_config, self.training_config)

    model_dir = os.path.join(dir_path, self.model_config.model_name)
    model_type = self.model_config.model_type

    # Value comparison: `is 1` tests object identity, which only works by
    # accident of the interpreter's small-integer caching.
    if self.model_config.fold_number == 1:
        self.model.load_weights(
            os.path.join(model_dir, model_type + "." + self.weight_file))
    else:
        self.models = []
        for i in range(self.model_config.fold_number):
            local_model = getModel(self.model_config, self.training_config)
            local_model.load_weights(
                os.path.join(
                    model_dir,
                    model_type + ".model{0}_weights.hdf5".format(i)))
            self.models.append(local_model)
def __init__(self, model_name, model_type="BidLSTM_CRF", embeddings_name=None, char_emb_size=25, max_char_length=30, char_lstm_units=25, word_lstm_units=100, dropout=0.5, recurrent_dropout=0.25, use_char_feature=True, use_crf=True, batch_size=20, optimizer='adam', learning_rate=0.001, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, early_stop=True, patience=5, max_checkpoints_to_keep=5, log_dir=None, use_ELMo=True, fold_number=1):
    """Set up an untrained sequence labelling wrapper.

    Builds the model and training configurations from the given
    hyper-parameters.  Embeddings are only instantiated when
    ``embeddings_name`` is given; otherwise ``self.embeddings`` is None and
    the word embedding size recorded in the configuration is 0.
    """
    self.model = None
    self.models = None
    self.p = None
    self.log_dir = log_dir
    self.embeddings_name = embeddings_name

    # Always define the attribute so that later accesses do not raise
    # AttributeError when no embeddings name was supplied.
    self.embeddings = None
    word_emb_size = 0
    if embeddings_name is not None:
        self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
        word_emb_size = self.embeddings.embed_size

    self.model_config = ModelConfig(model_name=model_name,
                                    model_type=model_type,
                                    embeddings_name=embeddings_name,
                                    word_embedding_size=word_emb_size,
                                    char_emb_size=char_emb_size,
                                    char_lstm_units=char_lstm_units,
                                    max_char_length=max_char_length,
                                    word_lstm_units=word_lstm_units,
                                    dropout=dropout,
                                    recurrent_dropout=recurrent_dropout,
                                    use_char_feature=use_char_feature,
                                    use_crf=use_crf,
                                    fold_number=fold_number,
                                    batch_size=batch_size,
                                    use_ELMo=use_ELMo)

    self.training_config = TrainingConfig(batch_size, optimizer,
                                          learning_rate, lr_decay,
                                          clip_gradients, max_epoch,
                                          early_stop, patience,
                                          max_checkpoints_to_keep)
def load(self, dir_path='data/models/sequenceLabelling/'):
    """Restore preprocessor, configuration, embeddings and weights from disk.

    All files are read from the sub-directory of ``dir_path`` named after
    the model.

    Args:
        dir_path: root directory holding one sub-directory per model.
    """
    # The preprocessor is located with the model name known before the
    # configuration is reloaded.
    preprocessor_path = os.path.join(
        dir_path, self.model_config.model_name, self.preprocessor_file)
    self.p = WordPreprocessor.load(preprocessor_path)

    config_path = os.path.join(
        dir_path, self.model_config.model_name, self.config_file)
    self.model_config = ModelConfig.load(config_path)

    # load embeddings
    config = self.model_config
    self.embeddings = Embeddings(config.embeddings_name,
                                 use_ELMo=config.use_ELMo)
    config.word_embedding_size = self.embeddings.embed_size

    self.model = get_model(config, self.p, ntags=len(self.p.vocab_tag))
    weights_path = os.path.join(dir_path, config.model_name, self.weight_file)
    self.model.load(filepath=weights_path)
def __init__(self, model_name="", model_type="gru", embeddings_name=None, list_classes=[], char_emb_size=25, dropout=0.5, recurrent_dropout=0.25, use_char_feature=False, batch_size=256, optimizer='adam', learning_rate=0.001, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, patience=5, log_dir=None, maxlen=300, fold_number=1, use_roc_auc=True, use_ELMo=False, use_BERT=False, embeddings=(), class_weights=None, multiprocessing=True):
    """Set up an untrained text classification wrapper.

    Builds the model and training configurations from the given
    hyper-parameters.  Embeddings are instantiated only when
    ``embeddings_name`` is given and ``model_type`` does not contain
    "bert"; otherwise ``self.embeddings`` is None and the word embedding
    size recorded in the configuration is 0.

    NOTE(review): ``list_classes`` defaults to a shared mutable list.  It is
    not mutated here, only forwarded to ``ModelConfig``; callers should pass
    their own list.  The ``embeddings`` parameter is accepted but not used
    in this constructor.
    """
    self.model = None
    self.models = None
    self.log_dir = log_dir
    self.embeddings_name = embeddings_name

    # Always define the attribute so that later accesses do not raise
    # AttributeError when no embeddings are built.
    self.embeddings = None
    word_emb_size = 0
    if embeddings_name is not None and model_type.find("bert") == -1:
        self.embeddings = Embeddings(embeddings_name,
                                     use_ELMo=use_ELMo,
                                     use_BERT=use_BERT)
        word_emb_size = self.embeddings.embed_size

    self.model_config = ModelConfig(model_name=model_name,
                                    model_type=model_type,
                                    embeddings_name=embeddings_name,
                                    list_classes=list_classes,
                                    char_emb_size=char_emb_size,
                                    word_emb_size=word_emb_size,
                                    dropout=dropout,
                                    recurrent_dropout=recurrent_dropout,
                                    use_char_feature=use_char_feature,
                                    maxlen=maxlen,
                                    fold_number=fold_number,
                                    batch_size=batch_size,
                                    use_ELMo=use_ELMo,
                                    use_BERT=use_BERT)

    self.training_config = TrainingConfig(batch_size, optimizer,
                                          learning_rate, lr_decay,
                                          clip_gradients, max_epoch,
                                          patience, use_roc_auc,
                                          class_weights=class_weights,
                                          multiprocessing=multiprocessing)
def preload(embeddings_name, input_path=None):
    """Load an embeddings resource into its LMDB database.

    The embeddings file is taken from ``input_path`` when given; otherwise
    it is downloaded from the ``url`` entry of the embeddings description in
    the registry, if there is one.  The file is then loaded into an LMDB
    environment created under the registry's ``embedding-lmdb-path``.

    Args:
        embeddings_name: name of the embeddings in the registry.
        input_path: optional path of a local embeddings file; when None the
            registry URL is used to download the resource.

    Returns:
        None.  Failures are reported on standard output and end the
        function early.
    """
    registry_path = './embedding-registry.json'
    embeddings = Embeddings(embeddings_name, path=registry_path, load=False)

    description = embeddings.get_description(embeddings_name)
    if description is None:
        # Not fatal by itself: an explicit input_path can still be loaded.
        print("Error: embedding name", embeddings_name,
              "is not registered in", registry_path)

    if input_path is None:
        embeddings_path = None
        # download if url is available
        if description is not None and "url" in description and len(description["url"]) > 0:
            url = description["url"]
            download_path = embeddings.registry['embedding-download-path']
            # if the download path does not exist, we create it
            if not os.path.isdir(download_path):
                try:
                    os.mkdir(download_path)
                except OSError:
                    print("Creation of the download directory", download_path, "failed")

            print("Downloading resource file for", embeddings_name, "...")
            embeddings_path = download_file(url, download_path)
            if embeddings_path is not None and os.path.isfile(embeddings_path):
                print("Download sucessful:", embeddings_path)
        else:
            print("Embeddings resource is not specified in the embeddings registry:", embeddings_name)
    else:
        embeddings_path = input_path

    if embeddings_path is None:
        print("Fail to retrive embedding file for", embeddings_name)
        # Nothing to open or load without a file.
        return

    # Opening the file is only used as an accessibility check here; the
    # actual loading below works from the path.
    embedding_file = open_embedding_file(embeddings_path)
    if embedding_file is None:
        print("Error: could not open embeddings file", embeddings_path)
        return
    # Release the probe handle when it offers close().
    # NOTE(review): the type returned by open_embedding_file is not visible here.
    close_probe = getattr(embedding_file, "close", None)
    if callable(close_probe):
        close_probe()

    # create and load the database in write mode
    embedding_lmdb_path = embeddings.registry["embedding-lmdb-path"]
    if not os.path.isdir(embedding_lmdb_path):
        os.makedirs(embedding_lmdb_path)

    envFilePath = os.path.join(embedding_lmdb_path, embeddings_name)
    # NOTE(review): map_size is not defined in this function; presumably a
    # module-level constant - confirm.
    embeddings.env = lmdb.open(envFilePath, map_size=map_size)
    embeddings.load_embeddings_from_file(embeddings_path)

    embeddings.clean_downloads()
def load(self, dir_path='data/models/textClassification/'):
    """Restore a saved text classification model (single or n-fold).

    The configuration is read from ``<dir_path>/<model_name>/``.  Without a
    transformer name in the configuration, the embeddings are
    re-instantiated; with one, no embeddings object is used.  With a single
    fold the weights are loaded into ``self.model``.  With several folds,
    ``self.models`` is filled with one loaded model per fold, or, for
    transformer models, with a single model that has no weights loaded yet.

    Args:
        dir_path: root directory holding one sub-directory per model.
    """
    model_path = os.path.join(dir_path, self.model_config.model_name)
    self.model_config = ModelConfig.load(
        os.path.join(model_path, self.config_file))

    uses_transformer = self.model_config.transformer_name is not None
    if uses_transformer:
        self.transformer_name = self.model_config.transformer_name
        self.embeddings = None
    else:
        # load embeddings
        # Do not use cache in 'production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     resource_registry=self.registry,
                                     use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

    self.model = getModel(self.model_config,
                          self.training_config,
                          load_pretrained_weights=False,
                          local_path=model_path)
    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    if self.model_config.fold_number == 1:
        single_weights = os.path.join(model_path, self.weight_file)
        print("load weights from", single_weights)
        self.model.load(single_weights)
        return

    self.models = []
    if self.model_config.transformer_name is not None:
        # only init first fold one, the other will be init at prediction time,
        # all weights will be loaded at prediction time
        first_fold_model = getModel(self.model_config,
                                    self.training_config,
                                    load_pretrained_weights=False,
                                    local_path=model_path)
        self.models.append(first_fold_model)
        return

    for fold in range(0, self.model_config.fold_number):
        fold_model = getModel(self.model_config,
                              self.training_config,
                              load_pretrained_weights=False,
                              local_path=model_path)
        fold_model.load(
            os.path.join(model_path, "model{0}_weights.hdf5".format(fold)))
        self.models.append(fold_model)
class Sequence(object):
    """Wrapper around a sequence labelling model.

    Groups the model, its preprocessor, its embeddings and its
    configurations, and exposes training (single and n-fold), evaluation,
    tagging and save/load operations.
    """

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.pkl'

    # number of parallel worker for the data generator when not using ELMo
    nb_workers = 6

    def __init__(self, model_name, model_type="BidLSTM_CRF", embeddings_name=None, char_emb_size=25, max_char_length=30, char_lstm_units=25, word_lstm_units=100, dropout=0.5, recurrent_dropout=0.25, use_char_feature=True, use_crf=True, batch_size=20, optimizer='adam', learning_rate=0.001, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, early_stop=True, patience=5, max_checkpoints_to_keep=5, log_dir=None, use_ELMo=True, fold_number=1):
        """Build the model and training configurations.

        Embeddings are only instantiated when ``embeddings_name`` is given;
        otherwise ``self.embeddings`` is None and the word embedding size
        recorded in the configuration is 0.
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so that later accesses do not raise
        # AttributeError when no embeddings name was supplied.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        word_embedding_size=word_emb_size,
                                        char_emb_size=char_emb_size,
                                        char_lstm_units=char_lstm_units,
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        use_crf=use_crf,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep)

    def _clean_embeddings_cache(self):
        """Clean the ELMo cache when embeddings with ELMo are in use."""
        if self.embeddings is not None and self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        """Train a single model.

        The preprocessor is prepared on the training data, extended with the
        validation data when both ``x_valid`` and ``y_valid`` are given.
        """
        # Without a validation set, fit the preprocessor on the training set
        # only, as train_nfold does, instead of failing in np.concatenate.
        # NOTE(review): the validation arguments are still forwarded as None
        # to the trainer - confirm that Trainer.train supports it.
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
        else:
            x_all, y_all = x_train, y_train
        self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        # NOTE: a former, disabled step dumped the ELMo token embeddings of
        # the training set here before building the model.

        self.model = get_model(self.model_config, self.p,
                               len(self.p.vocab_tag))
        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train(x_train, y_train, x_valid, y_valid)
        self._clean_embeddings_cache()

    def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, fold_number=10):
        """Train ``fold_number`` models, collected in ``self.models``.

        The preprocessor is prepared on the training data, extended with the
        validation data when both ``x_valid`` and ``y_valid`` are given.
        """
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
            self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        else:
            self.p = prepare_preprocessor(x_train, y_train,
                                          self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        self.models = []
        for _ in range(fold_number):
            self.models.append(
                get_model(self.model_config, self.p, len(self.p.vocab_tag)))

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train_nfold(x_train, y_train, x_valid, y_valid)
        self._clean_embeddings_cache()

    def eval(self, x_test, y_test):
        """Evaluate on a test set, fold by fold when n-fold models are loaded."""
        fold_number = self.model_config.fold_number
        if fold_number > 1 and self.models and len(self.models) == fold_number:
            self.eval_nfold(x_test, y_test)
        else:
            self.eval_single(x_test, y_test)

    def eval_single(self, x_test, y_test):
        """Evaluate ``self.model`` on a test set.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        # Prepare test data(steps, generator)
        test_generator = DataGenerator(
            x_test, y_test,
            batch_size=self.training_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=False)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)

    def eval_nfold(self, x_test, y_test):
        """Evaluate every fold model and print the averaged scores.

        The reports of the worst and best folds are printed, and the model
        with the best f1 becomes ``self.model``.  Does nothing when
        ``self.models`` is None.
        """
        if self.models is None:
            return

        total_f1 = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        total_precision = 0
        total_recall = 0
        for i in range(0, self.model_config.fold_number):
            print('\n------------------------ fold ' + str(i) + '--------------------------------------')

            # Prepare test data(steps, generator)
            test_generator = DataGenerator(
                x_test, y_test,
                batch_size=self.training_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=False)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.models[i]
            scorer.on_epoch_end(epoch=-1)
            f1 = scorer.f1
            reports.append(scorer.report)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += scorer.precision
            total_recall += scorer.recall

        macro_f1 = total_f1 / self.model_config.fold_number
        macro_precision = total_precision / self.model_config.fold_number
        macro_recall = total_recall / self.model_config.fold_number

        print("\naverage over", self.model_config.fold_number, "folds")
        print("\tmacro f1 =", macro_f1)
        print("\tmacro precision =", macro_precision)
        print("\tmacro recall =", macro_recall, "\n")

        print("\n** Worst ** model scores - \n")
        print(reports[worst_index])

        self.model = self.models[best_index]
        print("\n** Best ** model scores - \n")
        print(reports[best_index])

    def tag(self, texts, output_format):
        """Annotate a list of sentences.

        Returns the annotations in the specified ``output_format``; for
        'json' the runtime in seconds is added under the "runtime" key.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(self.model, self.model_config, self.embeddings,
                        preprocessor=self.p)
        start_time = time.time()
        annotations = tagger.tag(texts, output_format)
        runtime = round(time.time() - start_time, 3)
        # String equality, not identity: `is 'json'` depends on interning.
        if output_format == 'json':
            annotations["runtime"] = runtime
        return annotations

    def tag_file(self, file_in, output_format, file_out):
        """Annotate a text file containing one sentence per line.

        The annotations are written in ``file_out`` if not None, in the
        standard output otherwise.  Processing is streamed by batches so
        that huge files can be processed without memory issues: when the
        input spans several batches, the JSON returned by the tagger for
        each batch is glued into a single JSON document.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(self.model, self.model_config, self.embeddings,
                        preprocessor=self.p)
        start_time = time.time()
        batch_lines = self.model_config.batch_size * self.nb_workers

        out = open(file_out, 'w') if file_out is not None else None

        def emit(chunk, complete=False):
            # Write to the output file, or to standard output; there, a
            # complete chunk gets the trailing newline of a plain print().
            if out is not None:
                out.write(chunk)
            elif complete:
                print(chunk)
            else:
                print(chunk, end='', flush=True)

        def dump_text(entry):
            # Serialise one annotated text, shifted for nesting in "texts".
            dumped = json.dumps(entry, sort_keys=False, indent=4,
                                ensure_ascii=False)
            return re.sub('\n', '\n ', dumped)

        # The try/finally guarantees the output file is closed on error too.
        try:
            first_batch = True
            # if the following is true, we just output the JSON returned by
            # the tagger without any modification
            direct_dump = False
            with open(file_in, 'r') as f:
                texts = None
                while texts is None or len(texts) == batch_lines:
                    texts = next_n_lines(f, batch_lines)
                    annotations = tagger.tag(texts, output_format)
                    direct_dump = False

                    if first_batch:
                        first_batch = False
                        if len(texts) < batch_lines:
                            # The whole input fits in one batch.
                            runtime = round(time.time() - start_time, 3)
                            annotations['runtime'] = runtime
                            emit(json.dumps(annotations, sort_keys=False,
                                            indent=4, ensure_ascii=False),
                                 complete=True)
                            direct_dump = True
                        else:
                            # we need to modify a bit the JSON outputted by the tagger to glue the different batches
                            # output the general information attributes
                            header = '{\n "software": ' + json.dumps(annotations["software"], ensure_ascii=False) + ",\n"
                            header += ' "date": ' + json.dumps(annotations["date"], ensure_ascii=False) + ",\n"
                            header += ' "model": ' + json.dumps(annotations["model"], ensure_ascii=False) + ",\n"
                            header += ' "texts": ['
                            emit(header)
                            # Separate flag for the item separator, so the
                            # batch flag above is never clobbered.
                            first_item = True
                            for entry in annotations["texts"]:
                                separator = '\n ' if first_item else ',\n '
                                first_item = False
                                emit(separator + dump_text(entry))
                    else:
                        for entry in annotations["texts"]:
                            emit(',\n ' + dump_text(entry))

            runtime = round(time.time() - start_time, 3)
            if not direct_dump:
                footer = "\n ],\n"
                footer += ' "runtime": ' + str(runtime)
                footer += "\n}\n"
                emit(footer, complete=True)
        finally:
            if out is not None:
                out.close()

    def save(self, dir_path='data/models/sequenceLabelling/'):
        """Save preprocessor, configuration and weights under ``dir_path``.

        A sub-directory named after the model is created if needed.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore preprocessor, configuration, embeddings and weights.

        All files are read from the sub-directory of ``dir_path`` named
        after the model.
        """
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.preprocessor_file))

        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings; mirrors __init__, which builds none without a name
        if self.model_config.embeddings_name is not None:
            self.embeddings = Embeddings(self.model_config.embeddings_name,
                                         use_ELMo=self.model_config.use_ELMo)
            self.model_config.word_embedding_size = self.embeddings.embed_size
        else:
            self.embeddings = None
            self.model_config.word_embedding_size = 0

        self.model = get_model(self.model_config, self.p,
                               ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(
            dir_path, self.model_config.model_name, self.weight_file))
class Sequence(object): config_file = 'config.json' weight_file = 'model_weights.hdf5' preprocessor_file = 'preprocessor.json' #preprocessor_file_new = 'preprocessor.json' # number of parallel worker for the data generator when not using ELMo nb_workers = 6 def __init__(self, model_name, model_type="BidLSTM_CRF", embeddings_name=None, char_emb_size=25, max_char_length=30, char_lstm_units=25, word_lstm_units=100, max_sequence_length=300, dropout=0.5, recurrent_dropout=0.25, use_char_feature=True, use_crf=True, batch_size=20, optimizer='adam', learning_rate=0.001, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, early_stop=True, patience=5, max_checkpoints_to_keep=5, log_dir=None, use_ELMo=False, use_BERT=False, fold_number=1, multiprocessing=True, features_indices=None): self.model = None self.models = None self.p = None self.log_dir = log_dir self.embeddings_name = embeddings_name word_emb_size = 0 if embeddings_name is not None: self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT) word_emb_size = self.embeddings.embed_size else: self.embeddings = None self.model_config = ModelConfig( model_name=model_name, model_type=model_type, embeddings_name=embeddings_name, word_embedding_size=word_emb_size, char_emb_size=char_emb_size, char_lstm_units=char_lstm_units, max_char_length=max_char_length, word_lstm_units=word_lstm_units, max_sequence_length=max_sequence_length, dropout=dropout, recurrent_dropout=recurrent_dropout, use_char_feature=use_char_feature, use_crf=use_crf, fold_number=fold_number, batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT, features_indices=features_indices) self.training_config = TrainingConfig(batch_size, optimizer, learning_rate, lr_decay, clip_gradients, max_epoch, early_stop, patience, max_checkpoints_to_keep, multiprocessing) def train(self, x_train, y_train, f_train: np.array = None, x_valid=None, y_valid=None, f_valid: np.array = None, callbacks=None): # TBD if valid is None, segment train to get one 
x_all = np.concatenate( (x_train, x_valid), axis=0) if x_valid is not None else x_train y_all = np.concatenate( (y_train, y_valid), axis=0) if y_valid is not None else y_train features_all = concatenate_or_none((f_train, f_valid), axis=0) self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config) self.model_config.char_vocab_size = len(self.p.vocab_char) self.model_config.case_vocab_size = len(self.p.vocab_case) self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag)) if self.p.return_features is not False: print('x_train.shape: ', x_train.shape) print('features_train.shape: ', f_train.shape) sample_transformed_features = self.p.transform_features(f_train) self.model_config.max_feature_size = np.asarray( sample_transformed_features).shape[-1] print('max_feature_size: ', self.model_config.max_feature_size) trainer = Trainer(self.model, self.models, self.embeddings, self.model_config, self.training_config, checkpoint_path=self.log_dir, preprocessor=self.p) trainer.train(x_train, y_train, x_valid, y_valid, features_train=f_train, features_valid=f_valid, callbacks=callbacks) if self.embeddings.use_ELMo: self.embeddings.clean_ELMo_cache() if self.embeddings.use_BERT: self.embeddings.clean_BERT_cache() def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train: np.array = None, f_valid: np.array = None, fold_number=10, callbacks=None): x_all = np.concatenate( (x_train, x_valid), axis=0) if x_valid is not None else x_train y_all = np.concatenate( (y_train, y_valid), axis=0) if y_valid is not None else y_train features_all = concatenate_or_none((f_train, f_valid), axis=0) self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config) self.model_config.char_vocab_size = len(self.p.vocab_char) self.model_config.case_vocab_size = len(self.p.vocab_case) self.p.return_lengths = True if 'bert' in self.model_config.model_type.lower(): self.model = 
get_model(self.model_config, self.p, len(self.p.vocab_tag)) self.models = [] for k in range(0, fold_number): model = get_model(self.model_config, self.p, len(self.p.vocab_tag)) self.models.append(model) trainer = Trainer(self.model, self.models, self.embeddings, self.model_config, self.training_config, checkpoint_path=self.log_dir, preprocessor=self.p) trainer.train_nfold(x_train, y_train, x_valid, y_valid, f_train=f_train, f_valid=f_valid, callbacks=callbacks) if self.embeddings.use_ELMo: self.embeddings.clean_ELMo_cache() if self.embeddings.use_BERT: self.embeddings.clean_BERT_cache() if 'bert' in self.model_config.model_type.lower(): self.save() def eval(self, x_test, y_test, features=None): if self.models and 1 < self.model_config.fold_number == len( self.models): self.eval_nfold(x_test, y_test, features=features) else: self.eval_single(x_test, y_test, features=features) def eval_single(self, x_test, y_test, features=None): if 'bert' not in self.model_config.model_type.lower(): if self.model: # Prepare test data(steps, generator) test_generator = DataGenerator( x_test, y_test, batch_size=self.model_config.batch_size, preprocessor=self.p, char_embed_size=self.model_config.char_embedding_size, max_sequence_length=self.model_config.max_sequence_length, embeddings=self.embeddings, shuffle=False, features=features) # Build the evaluator and evaluate the model scorer = Scorer(test_generator, self.p, evaluation=True) scorer.model = self.model scorer.on_epoch_end(epoch=-1) else: raise (OSError('Could not find a model.')) else: # BERT architecture model y_pred = self.model.predict(x_test, fold_id=-1) nb_alignment_issues = 0 for i in range(len(y_test)): if len(y_test[i]) != len(y_pred[i]): nb_alignment_issues += 1 # BERT tokenizer appears to introduce some additional tokens without ## prefix, # but this is normally handled when predicting. # To be very conservative, the following ensure the number of tokens always # match, but it should never be used in practice. 
if len(y_test[i]) < len(y_pred[i]): y_test[i] = y_test[i] + ["O"] * (len(y_pred[i]) - len(y_test[i])) if len(y_test[i]) > len(y_pred[i]): y_pred[i] = y_pred[i] + ["O"] * (len(y_test[i]) - len(y_pred[i])) if nb_alignment_issues > 0: print("number of alignment issues with test set:", nb_alignment_issues) report, report_as_map = classification_report(y_test, y_pred, digits=4) print(report) def eval_nfold(self, x_test, y_test, features=None): if self.models is not None: total_f1 = 0 best_f1 = 0 best_index = 0 worst_f1 = 1 worst_index = 0 reports = [] reports_as_map = [] total_precision = 0 total_recall = 0 for i in range(self.model_config.fold_number): print('\n------------------------ fold ' + str(i) + ' --------------------------------------') if 'bert' not in self.model_config.model_type.lower(): # Prepare test data(steps, generator) test_generator = DataGenerator( x_test, y_test, batch_size=self.model_config.batch_size, preprocessor=self.p, char_embed_size=self.model_config.char_embedding_size, max_sequence_length=self.model_config. 
max_sequence_length, embeddings=self.embeddings, shuffle=False, features=features) # Build the evaluator and evaluate the model scorer = Scorer(test_generator, self.p, evaluation=True) scorer.model = self.models[i] scorer.on_epoch_end(epoch=-1) f1 = scorer.f1 precision = scorer.precision recall = scorer.recall reports.append(scorer.report) reports_as_map.append(scorer.report_as_map) else: # BERT architecture model dir_path = 'data/models/sequenceLabelling/' self.model_config = ModelConfig.load( os.path.join(dir_path, self.model_config.model_name, self.config_file)) self.p = WordPreprocessor.load( os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file)) self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag)) self.model.load_model(i) y_pred = self.model.predict(x_test, fold_id=i) nb_alignment_issues = 0 for j in range(len(y_test)): if len(y_test[i]) != len(y_pred[j]): nb_alignment_issues += 1 # BERT tokenizer appears to introduce some additional tokens without ## prefix, # but this is normally handled when predicting. # To be very conservative, the following ensure the number of tokens always # match, but it should never be used in practice. 
if len(y_test[j]) < len(y_pred[j]): y_test[j] = y_test[j] + ["O"] * ( len(y_pred[j]) - len(y_test[j])) if len(y_test[j]) > len(y_pred[j]): y_pred[j] = y_pred[j] + ["O"] * ( len(y_test[j]) - len(y_pred[j])) if nb_alignment_issues > 0: print("number of alignment issues with test set:", nb_alignment_issues) f1 = f1_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) print("\tf1: {:04.2f}".format(f1 * 100)) print("\tprecision: {:04.2f}".format(precision * 100)) print("\trecall: {:04.2f}".format(recall * 100)) report, report_as_map = classification_report(y_test, y_pred, digits=4) reports.append(report) reports_as_map.append(report_as_map) if best_f1 < f1: best_f1 = f1 best_index = i if worst_f1 > f1: worst_f1 = f1 worst_index = i total_f1 += f1 total_precision += precision total_recall += recall fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}} micro_f1 = total_f1 / self.model_config.fold_number micro_precision = total_precision / self.model_config.fold_number micro_recall = total_recall / self.model_config.fold_number micro_eval_block = { 'f1': micro_f1, 'precision': micro_precision, 'recall': micro_recall } fold_average_evaluation['micro'] = micro_eval_block # field-level average over the n folds labels = [] for label in sorted(self.p.vocab_tag): if label == 'O' or label == '<PAD>': continue if label.startswith("B-") or label.startswith( "S-") or label.startswith("I-") or label.startswith( "E-"): label = label[2:] if label in labels: continue labels.append(label) sum_p = 0 sum_r = 0 sum_f1 = 0 sum_support = 0 for j in range(0, self.model_config.fold_number): if not label in reports_as_map[j]['labels']: continue report_as_map = reports_as_map[j]['labels'][label] sum_p += report_as_map["precision"] sum_r += report_as_map["recall"] sum_f1 += report_as_map["f1"] sum_support += report_as_map["support"] avg_p = sum_p / self.model_config.fold_number avg_r = sum_r / self.model_config.fold_number avg_f1 = 
sum_f1 / self.model_config.fold_number avg_support = sum_support / self.model_config.fold_number avg_support_dec = str(avg_support - int(avg_support))[1:] if avg_support_dec != '0': avg_support = math.floor(avg_support) block_label = { 'precision': avg_p, 'recall': avg_r, 'support': avg_support, 'f1': avg_f1 } fold_average_evaluation['labels'][label] = block_label print( "----------------------------------------------------------------------" ) print("\n** Worst ** model scores - run", str(worst_index)) print(reports[worst_index]) print("\n** Best ** model scores - run", str(best_index)) print(reports[best_index]) if 'bert' not in self.model_config.model_type.lower(): self.model = self.models[best_index] else: # copy best BERT model fold_number best_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name + str( best_index) new_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name # update new_model_dir if it already exists, keep its existing config content merge_folders(best_model_dir, new_model_dir) # clean other fold directory for i in range(self.model_config.fold_number): shutil.rmtree('data/models/sequenceLabelling/' + self.model_config.model_name + str(i)) print( "----------------------------------------------------------------------" ) print("\nAverage over", self.model_config.fold_number, "folds") print( get_report(fold_average_evaluation, digits=4, include_avgs=['micro'])) def tag(self, texts, output_format, features=None): # annotate a list of sentences, return the list of annotations in the # specified output_format if self.model: tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p) start_time = time.time() annotations = tagger.tag(texts, output_format, features=features) runtime = round(time.time() - start_time, 3) if output_format is 'json': annotations["runtime"] = runtime #else: # print("runtime: %s seconds " % (runtime)) return annotations else: raise (OSError('Could not find a 
model.' + str(self.model))) def tag_file(self, file_in, output_format, file_out): # Annotate a text file containing one sentence per line, the annotations are # written in the output file if not None, in the standard output otherwise. # Processing is streamed by batches so that we can process huge files without # memory issues if self.model: tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p) start_time = time.time() if file_out is not None: out = open(file_out, 'w') first = True with open(file_in, 'r') as f: texts = None while texts is None or len( texts ) == self.model_config.batch_size * self.nb_workers: texts = next_n_lines( f, self.model_config.batch_size * self.nb_workers) annotations = tagger.tag(texts, output_format) # if the following is true, we just output the JSON returned by the tagger without any modification directDump = False if first: first = False if len( texts ) < self.model_config.batch_size * self.nb_workers: runtime = round(time.time() - start_time, 3) annotations['runtime'] = runtime jsonString = json.dumps(annotations, sort_keys=False, indent=4, ensure_ascii=False) if file_out is None: print(jsonString) else: out.write(jsonString) directDump = True else: # we need to modify a bit the JSON outputted by the tagger to glue the different batches # output the general information attributes jsonString = '{\n "software": ' + json.dumps( annotations["software"], ensure_ascii=False) + ",\n" jsonString += ' "date": ' + json.dumps( annotations["date"], ensure_ascii=False) + ",\n" jsonString += ' "model": ' + json.dumps( annotations["model"], ensure_ascii=False) + ",\n" jsonString += ' "texts": [' if file_out is None: print(jsonString, end='', flush=True) else: out.write(jsonString) first = True for jsonStr in annotations["texts"]: jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False) #jsonString = jsonString.replace('\n', '\n\t\t') jsonString = re.sub('\n', '\n ', jsonString) if file_out is 
None: if not first: print(',\n ' + jsonString, end='', flush=True) else: first = False print('\n ' + jsonString, end='', flush=True) else: if not first: out.write(',\n ') out.write(jsonString) else: first = False out.write('\n ') out.write(jsonString) else: for jsonStr in annotations["texts"]: jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False) jsonString = re.sub('\n', '\n ', jsonString) if file_out is None: print(',\n ' + jsonString, end='', flush=True) else: out.write(',\n ') out.write(jsonString) runtime = round(time.time() - start_time, 3) if not directDump: jsonString = "\n ],\n" jsonString += ' "runtime": ' + str(runtime) jsonString += "\n}\n" if file_out is None: print(jsonString) else: out.write(jsonString) if file_out is not None: out.close() #print("runtime: %s seconds " % (runtime)) else: raise (OSError('Could not find a model.')) def save(self, dir_path='data/models/sequenceLabelling/'): # create subfolder for the model if not already exists directory = os.path.join(dir_path, self.model_config.model_name) if not os.path.exists(directory): os.makedirs(directory) self.model_config.save(os.path.join(directory, self.config_file)) print('model config file saved') self.p.save(os.path.join(directory, self.preprocessor_file)) print('preprocessor saved') # bert model are always saved via training process steps as checkpoint if self.model_config.model_type.lower().find("bert") == -1: if self.model is None and self.model_config.fold_number != 0 and self.model_config.fold_number != 1: print( 'Error: model not saved. 
Evaluation need to be called first to select the best fold model to be saved' ) else: self.model.save(os.path.join(directory, self.weight_file)) print('model saved') def load(self, dir_path='data/models/sequenceLabelling/'): self.model_config = ModelConfig.load( os.path.join(dir_path, self.model_config.model_name, self.config_file)) self.p = WordPreprocessor.load( os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file)) if self.model_config.model_type.lower().find("bert") != -1: self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag), dir_path=dir_path) self.model.load_model() return # load embeddings # Do not use cache in 'production' mode self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo, use_BERT=self.model_config.use_BERT, use_cache=False) self.model_config.word_embedding_size = self.embeddings.embed_size self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag)) self.model.load(filepath=os.path.join( dir_path, self.model_config.model_name, self.weight_file))
class Classifier(object):
    """Wrapper around a text classification model.

    Bundles a model configuration, a training configuration, optional
    static embeddings and one model (or one model per fold), and exposes
    training, prediction, evaluation, saving and loading.

    Architectures whose ``model_type`` contains ``"bert"`` follow a separate
    path everywhere: no ``Embeddings`` object is created for them and they
    are driven through a ``BERT_classifier_processor``.
    """

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None,
                 multiprocessing=True):
        """Create the model and training configurations.

        Args:
            model_name: name of the model; also used as the sub-directory
                name by ``save`` and ``load``.
            model_type: architecture identifier; a value containing
                ``"bert"`` selects the BERT code paths.
            embeddings_name: name of the embeddings to load, or None.
            list_classes: labels of the classes; defaults to an empty list.
            embeddings: accepted for backward compatibility, not used.
            log_dir: stored on the instance as ``log_dir``.

        The remaining arguments are forwarded unchanged to ``ModelConfig``
        and ``TrainingConfig``.
        """
        # a fresh list per instance: a shared mutable default would leak
        # class labels from one Classifier to the next
        if list_classes is None:
            list_classes = []

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # always define the attribute; it stays None for BERT models and
        # when no embeddings name is given
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None and model_type.find("bert") == -1:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size,
                                              optimizer,
                                              learning_rate,
                                              lr_decay,
                                              clip_gradients,
                                              max_epoch,
                                              patience,
                                              use_roc_auc,
                                              class_weights=class_weights,
                                              multiprocessing=multiprocessing)

    def train(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train a single model; 10% of the data is held out for validation.

        ``vocab_init`` is accepted but not used by this method.
        """
        self.model = getModel(self.model_config, self.training_config)

        # bert models
        if self.model_config.model_type.find("bert") != -1:
            self.model.processor = BERT_classifier_processor(
                labels=self.model_config.list_classes,
                x_train=x_train,
                y_train=y_train)
            self.model.train()
            return

        # create validation set in case we don't use k-folds
        xtr, val_x, y, val_y = train_test_split(x_train, y_train, test_size=0.1)

        training_generator = DataGenerator(
            xtr, y,
            batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=True)
        validation_generator = DataGenerator(
            val_x, None,
            batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=False)

        self.model, best_roc_auc = train_model(
            self.model,
            self.model_config.list_classes,
            self.training_config.batch_size,
            self.training_config.max_epoch,
            self.training_config.use_roc_auc,
            self.training_config.class_weights,
            training_generator,
            validation_generator,
            val_y,
            use_ELMo=self.embeddings.use_ELMo,
            use_BERT=self.embeddings.use_BERT,
            multiprocessing=self.training_config.multiprocessing,
            callbacks=callbacks)

        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def train_nfold(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train one model per fold and keep them in ``self.models``.

        For BERT architectures a single model object is created and trained
        instead, and stored in ``self.model``. ``vocab_init`` is not used.
        """
        # bert models
        if self.model_config.model_type.find("bert") != -1:
            self.model = getModel(self.model_config, self.training_config)
            self.model.processor = BERT_classifier_processor(
                labels=self.model_config.list_classes,
                x_train=x_train,
                y_train=y_train)
            self.model.train()
            return

        self.models = train_folds(x_train, y_train, self.model_config,
                                  self.training_config, self.embeddings,
                                  callbacks=callbacks)

        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def _to_json(self, texts, result):
        """Pair every text with its per-class scores in a JSON-ready dict."""
        classes = self.model_config.list_classes
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "classifications": []
        }
        for i, text in enumerate(texts):
            scores = result[i]
            classification = {"text": text}
            for j, cl in enumerate(classes):
                classification[cl] = float(scores[j])
            res["classifications"].append(classification)
        return res

    # classification
    def predict(self, texts, output_format='json', use_main_thread_only=False):
        """Classify ``texts``.

        Returns:
            With ``output_format == 'json'``, a dict with the keys
            ``software``, ``date``, ``model`` and ``classifications`` (one
            entry per text holding the text and one float per class).
            Otherwise the raw result of the underlying prediction call.

        Raises:
            OSError: if the required model(s) have not been built or loaded.
        """
        config = self.model_config

        if config.model_type.find("bert") != -1:
            # we don't support n classifiers for BERT for prediction currently
            # (it would be too large and too slow if loaded 10 times from file
            # for each batch), so the single model is used whatever the fold
            # number is
            if self.model is None:
                raise OSError('Could not find a model.')
            # be sure the input processor is instantiated
            self.model.processor = BERT_classifier_processor(
                labels=config.list_classes)
            result = self.model.predict(texts)
        elif config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            predict_generator = DataGenerator(
                texts, None,
                batch_size=config.batch_size,
                maxlen=config.maxlen,
                list_classes=config.list_classes,
                embeddings=self.embeddings,
                shuffle=False)
            result = predict(self.model, predict_generator,
                             use_ELMo=self.embeddings.use_ELMo,
                             use_BERT=self.embeddings.use_BERT,
                             use_main_thread_only=use_main_thread_only)
        else:
            if self.models is None:
                raise OSError('Could not find nfolds models.')
            predict_generator = DataGenerator(
                texts, None,
                batch_size=config.batch_size,
                maxlen=config.maxlen,
                list_classes=config.list_classes,
                embeddings=self.embeddings,
                shuffle=False)
            result = predict_folds(self.models, predict_generator,
                                   use_ELMo=self.embeddings.use_ELMo,
                                   use_BERT=self.embeddings.use_BERT,
                                   use_main_thread_only=use_main_thread_only)

        # value comparison: an identity test against the literal would miss
        # an equal string that was built at runtime
        if output_format == 'json':
            return self._to_json(texts, result)
        return result

    def eval(self, x_test, y_test, use_main_thread_only=False):
        """Predict on ``x_test`` and print a per-class score table.

        ``x_test`` must expose ``shape``; in the multi-class case ``y_test``
        is indexed as a 2-D array (instances x classes).

        Raises:
            OSError: if the required model(s) have not been built or loaded.
        """
        config = self.model_config
        classes = config.list_classes
        n_classes = len(classes)
        is_bert = config.model_type.find("bert") != -1

        if config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            if is_bert:
                result = self.model.predict(x_test)
            else:
                test_generator = DataGenerator(
                    x_test, None,
                    batch_size=config.batch_size,
                    maxlen=config.maxlen,
                    list_classes=classes,
                    embeddings=self.embeddings,
                    shuffle=False)
                result = predict(self.model, test_generator,
                                 use_ELMo=self.embeddings.use_ELMo,
                                 use_BERT=self.embeddings.use_BERT,
                                 use_main_thread_only=use_main_thread_only)
        else:
            if self.models is not None or (is_bert and self.model is not None):
                print(config.model_type)
                if is_bert:
                    # one prediction per fold, combined by geometric mean
                    result_list = [self.model.predict(x_test, i)
                                   for i in range(config.fold_number)]
                    result = np.ones(result_list[0].shape)
                    for fold_result in result_list:
                        result *= fold_result
                    result **= (1. / len(result_list))
                else:
                    test_generator = DataGenerator(
                        x_test, None,
                        batch_size=config.batch_size,
                        maxlen=config.maxlen,
                        list_classes=classes,
                        embeddings=self.embeddings,
                        shuffle=False)
                    result = predict_folds(
                        self.models, test_generator,
                        use_ELMo=self.embeddings.use_ELMo,
                        use_BERT=self.embeddings.use_BERT,
                        use_main_thread_only=use_main_thread_only)
            else:
                raise OSError('Could not find nfolds models.')

        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        def vectorize(index, size):
            # one-hot vector of length `size` with a 1 at `index`
            one_hot = np.zeros(size)
            if index < size:
                one_hot[index] = 1
            return one_hot

        # keep only the best scoring class of every instance
        result_intermediate = np.asarray([np.argmax(line) for line in result])
        result_binary = np.array([vectorize(xi, n_classes)
                                  for xi in result_intermediate])

        precision, recall, fscore, support = precision_recall_fscore_support(
            y_test, result_binary, average=None)
        print('{:>14} {:>12} {:>12} {:>12} {:>12}'.format(
            " ", "precision", "recall", "f-score", "support"))
        for p, the_class in enumerate(classes):
            the_class = the_class[:14]
            print('{:>14} {:>12} {:>12} {:>12} {:>12}'.format(
                the_class,
                "{:10.4f}".format(precision[p]),
                "{:10.4f}".format(recall[p]),
                "{:10.4f}".format(fscore[p]),
                support[p]))

        # NOTE: the macro- and micro-averages below are computed but not
        # reported at present.

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if n_classes == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result, labels=[0, 1])
            if len(np.unique(y_test)) == 1:
                # roc_auc_score sklearn implementation is not working in this
                # case, it needs more balanced batches; a simple fix is to
                # return the r2_score instead in this case (which is a
                # regression score and not a loss)
                total_roc_auc = r2_score(y_test, result)
                if total_roc_auc < 0:
                    total_roc_auc = 0
            else:
                total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(0, n_classes):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j], labels=[0, 1])
                total_loss += loss
                if len(np.unique(y_test[:, j])) == 1:
                    # same r2_score fallback as in the 1-class case above
                    roc_auc = r2_score(y_test[:, j], result[:, j])
                    if roc_auc < 0:
                        roc_auc = 0
                else:
                    roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc

        total_accuracy /= n_classes
        total_f1 /= n_classes
        total_loss /= n_classes
        total_roc_auc /= n_classes

        # micro-average (average of scores for each instance)
        # make sense only if we have more than 1 class, otherwise same as
        # macro-average
        if n_classes != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(0, result.shape[0]):
                accuracy = accuracy_score(y_test[i, :], result_binary[i, :])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i, :], result_binary[i, :], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i, :], result[i, :])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i, :], result[i, :])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

    def save(self, dir_path='data/models/textClassification/'):
        """Write the configuration and the model weights under ``dir_path``.

        Files go to the sub-directory ``<dir_path>/<model_name>``, which is
        created if missing. BERT weights are not written here.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        # bert model are always saved via training process steps as checkpoint
        if self.model_config.model_type.find("bert") != -1:
            print('model saved')
            return

        if self.model_config.fold_number == 1:
            if self.model is not None:
                self.model.save(os.path.join(
                    directory,
                    self.model_config.model_type + "." + self.weight_file))
                print('model saved')
            else:
                print('Error: model has not been built')
        else:
            if self.models is None:
                print('Error: nfolds models have not been built')
            else:
                for i in range(0, self.model_config.fold_number):
                    self.models[i].save(os.path.join(
                        directory,
                        self.model_config.model_type
                        + ".model{0}_weights.hdf5".format(i)))
                print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Reload the configuration and the model(s) saved under ``dir_path``.

        The sub-directory is selected by the current
        ``self.model_config.model_name``.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name, self.config_file))

        if self.model_config.model_type.find("bert") != -1:
            self.model = getModel(self.model_config, self.training_config)
            self.model.load()
            return

        # load embeddings
        # Do not use cache in 'production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT,
                                     use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        if self.model_config.fold_number == 1:
            self.model.load_weights(os.path.join(
                dir_path,
                self.model_config.model_name,
                self.model_config.model_type + "." + self.weight_file))
        else:
            self.models = []
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(os.path.join(
                    dir_path,
                    self.model_config.model_name,
                    self.model_config.model_type
                    + ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)
def __init__(self,
             model_name=None,
             architecture=None,
             embeddings_name=None,
             char_emb_size=25,
             max_char_length=30,
             char_lstm_units=25,
             word_lstm_units=100,
             max_sequence_length=300,
             dropout=0.5,
             recurrent_dropout=0.25,
             batch_size=20,
             optimizer='adam',
             learning_rate=0.001,
             lr_decay=0.9,
             clip_gradients=5.0,
             max_epoch=50,
             early_stop=True,
             patience=5,
             max_checkpoints_to_keep=0,
             use_ELMo=False,
             log_dir=None,
             fold_number=1,
             multiprocessing=True,
             features_indices=None,
             transformer_name: str = None):
    """Set up the resource registry, optional embeddings and both configs.

    When ``model_name`` is None a name is derived from ``architecture``,
    followed by ``_<embeddings_name>`` and ``_<transformer_name>`` for
    whichever of the two is given. Embeddings are only loaded when
    ``embeddings_name`` is not None; otherwise the word embedding size passed
    to ``ModelConfig`` is 0. The remaining arguments are forwarded unchanged
    to ``ModelConfig`` and ``TrainingConfig``.
    """
    if model_name is None:
        # add a dummy name based on the architecture
        model_name = architecture
        for suffix in (embeddings_name, transformer_name):
            if suffix is not None:
                model_name += "_" + suffix

    # nothing is built or loaded yet
    self.model = None
    self.models = None
    self.p: Preprocessor = None
    self.model_local_path = None

    self.log_dir = log_dir
    self.embeddings_name = embeddings_name

    self.registry = load_resource_registry("delft/resources-registry.json")

    if self.embeddings_name is None:
        self.embeddings = None
        word_emb_size = 0
    else:
        self.embeddings = Embeddings(self.embeddings_name,
                                     resource_registry=self.registry,
                                     use_ELMo=use_ELMo)
        word_emb_size = self.embeddings.embed_size

    model_settings = dict(
        model_name=model_name,
        architecture=architecture,
        embeddings_name=embeddings_name,
        word_embedding_size=word_emb_size,
        char_emb_size=char_emb_size,
        char_lstm_units=char_lstm_units,
        max_char_length=max_char_length,
        word_lstm_units=word_lstm_units,
        max_sequence_length=max_sequence_length,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        fold_number=fold_number,
        batch_size=batch_size,
        use_ELMo=use_ELMo,
        features_indices=features_indices,
        transformer_name=transformer_name,
    )
    self.model_config = ModelConfig(**model_settings)

    self.training_config = TrainingConfig(
        batch_size,
        optimizer,
        learning_rate,
        lr_decay,
        clip_gradients,
        max_epoch,
        early_stop,
        patience,
        max_checkpoints_to_keep,
        multiprocessing)
class Sequence(object): # number of parallel worker for the data generator nb_workers = 6 def __init__(self, model_name=None, architecture=None, embeddings_name=None, char_emb_size=25, max_char_length=30, char_lstm_units=25, word_lstm_units=100, max_sequence_length=300, dropout=0.5, recurrent_dropout=0.25, batch_size=20, optimizer='adam', learning_rate=0.001, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, early_stop=True, patience=5, max_checkpoints_to_keep=0, use_ELMo=False, log_dir=None, fold_number=1, multiprocessing=True, features_indices=None, transformer_name: str = None): if model_name is None: # add a dummy name based on the architecture model_name = architecture if embeddings_name is not None: model_name += "_" + embeddings_name if transformer_name is not None: model_name += "_" + transformer_name self.model = None self.models = None self.p: Preprocessor = None self.log_dir = log_dir self.embeddings_name = embeddings_name word_emb_size = 0 self.embeddings = None self.model_local_path = None self.registry = load_resource_registry("delft/resources-registry.json") if self.embeddings_name is not None: self.embeddings = Embeddings(self.embeddings_name, resource_registry=self.registry, use_ELMo=use_ELMo) word_emb_size = self.embeddings.embed_size else: self.embeddings = None word_emb_size = 0 self.model_config = ModelConfig( model_name=model_name, architecture=architecture, embeddings_name=embeddings_name, word_embedding_size=word_emb_size, char_emb_size=char_emb_size, char_lstm_units=char_lstm_units, max_char_length=max_char_length, word_lstm_units=word_lstm_units, max_sequence_length=max_sequence_length, dropout=dropout, recurrent_dropout=recurrent_dropout, fold_number=fold_number, batch_size=batch_size, use_ELMo=use_ELMo, features_indices=features_indices, transformer_name=transformer_name) self.training_config = TrainingConfig(batch_size, optimizer, learning_rate, lr_decay, clip_gradients, max_epoch, early_stop, patience, max_checkpoints_to_keep, 
multiprocessing) def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, callbacks=None): # TBD if valid is None, segment train to get one if early_stop is True # we concatenate all the training+validation data to create the model vocabulary if not x_valid is None: x_all = np.concatenate((x_train, x_valid), axis=0) else: x_all = x_train if not y_valid is None: y_all = np.concatenate((y_train, y_valid), axis=0) else: y_all = y_train features_all = concatenate_or_none((f_train, f_valid), axis=0) self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config) self.model_config.char_vocab_size = len(self.p.vocab_char) self.model_config.case_vocab_size = len(self.p.vocab_case) self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True) print_parameters(self.model_config, self.training_config) self.model.print_summary() # uncomment to plot graph #plot_model(self.model, # to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png') trainer = Trainer( self.model, self.models, self.embeddings, self.model_config, self.training_config, checkpoint_path=self.log_dir, preprocessor=self.p, transformer_preprocessor=self.model.transformer_preprocessor) trainer.train(x_train, y_train, x_valid, y_valid, features_train=f_train, features_valid=f_valid, callbacks=callbacks) if self.embeddings and self.embeddings.use_ELMo: self.embeddings.clean_ELMo_cache() def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, callbacks=None): x_all = np.concatenate( (x_train, x_valid), axis=0) if x_valid is not None else x_train y_all = np.concatenate( (y_train, y_valid), axis=0) if y_valid is not None else y_train features_all = concatenate_or_none((f_train, f_valid), axis=0) self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config) 
self.model_config.char_vocab_size = len(self.p.vocab_char) self.model_config.case_vocab_size = len(self.p.vocab_case) self.models = [] trainer = Trainer(self.model, self.models, self.embeddings, self.model_config, self.training_config, checkpoint_path=self.log_dir, preprocessor=self.p) trainer.train_nfold(x_train, y_train, x_valid, y_valid, f_train=f_train, f_valid=f_valid, callbacks=callbacks) if self.embeddings and self.embeddings.use_ELMo: self.embeddings.clean_ELMo_cache() def eval(self, x_test, y_test, features=None): if self.model_config.fold_number > 1: self.eval_nfold(x_test, y_test, features=features) else: self.eval_single(x_test, y_test, features=features) def eval_single(self, x_test, y_test, features=None): if self.model is None: raise (OSError('Could not find a model.')) print_parameters(self.model_config, self.training_config) self.model.print_summary() if self.model_config.transformer_name is None: # we can use a data generator for evaluation # Prepare test data(steps, generator) generator = self.model.get_generator() test_generator = generator( x_test, y_test, batch_size=self.model_config.batch_size, preprocessor=self.p, char_embed_size=self.model_config.char_embedding_size, max_sequence_length=self.model_config.max_sequence_length, embeddings=self.embeddings, shuffle=False, features=features, output_input_offsets=True, use_chain_crf=self.model_config.use_chain_crf) # Build the evaluator and evaluate the model scorer = Scorer(test_generator, self.p, evaluation=True, use_crf=self.model_config.use_crf, use_chain_crf=self.model_config.use_chain_crf) scorer.model = self.model scorer.on_epoch_end(epoch=-1) else: # the architecture model uses a transformer layer # note that we could also use the above test_generator, but as an alternative here we check the # test/prediction alignment of tokens and the validity of the maximum sequence input length # wrt the length of the test sequences tagger = Tagger( self.model, self.model_config, self.embeddings, 
preprocessor=self.p, transformer_preprocessor=self.model.transformer_preprocessor) y_pred_pairs = tagger.tag(x_test, output_format=None, features=features) # keep only labels y_pred = [] for result in y_pred_pairs: result_labels = [] for pair in result: result_labels.append(pair[1]) y_pred.append(result_labels) nb_alignment_issues = 0 for i in range(len(y_test)): if len(y_test[i]) != len(y_pred[i]): #print("y_test:", y_test[i]) #print("y_pred:", y_pred[i]) nb_alignment_issues += 1 # BERT tokenizer appears to introduce some additional tokens without ## prefix, # but we normally handled that well when predicting. # To be very conservative, the following ensure the number of tokens always # match, but it should never be used in practice. if len(y_test[i]) < len(y_pred[i]): y_test[i] = y_test[i] + ["O"] * (len(y_pred[i]) - len(y_test[i])) if len(y_test[i]) > len(y_pred[i]): y_pred[i] = y_pred[i] + ["O"] * (len(y_test[i]) - len(y_pred[i])) if nb_alignment_issues > 0: print("number of alignment issues with test set:", nb_alignment_issues) print( "to solve them consider increasing the maximum sequence input length of the model and retrain" ) report, report_as_map = classification_report(y_test, y_pred, digits=4) print(report) def eval_nfold(self, x_test, y_test, features=None): if self.models is not None: total_f1 = 0 best_f1 = 0 best_index = 0 worst_f1 = 1 worst_index = 0 reports = [] reports_as_map = [] total_precision = 0 total_recall = 0 for i in range(self.model_config.fold_number): if self.model_config.transformer_name is None: the_model = self.models[i] bert_preprocessor = None else: # the architecture model uses a transformer layer, it is large and needs to be loaded from disk dir_path = 'data/models/sequenceLabelling/' weight_file = DEFAULT_WEIGHT_FILE_NAME.replace( ".hdf5", str(i) + ".hdf5") self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag), load_pretrained_weights=False, local_path=os.path.join( dir_path, 
self.model_config.model_name)) self.model.load(filepath=os.path.join( dir_path, self.model_config.model_name, weight_file)) the_model = self.model bert_preprocessor = self.model.transformer_preprocessor if i == 0: the_model.print_summary() print_parameters(self.model_config, self.training_config) print('\n------------------------ fold ' + str(i) + ' --------------------------------------') # we can use a data generator for evaluation # Prepare test data(steps, generator) generator = the_model.get_generator() test_generator = generator( x_test, y_test, batch_size=self.model_config.batch_size, preprocessor=self.p, bert_preprocessor=bert_preprocessor, char_embed_size=self.model_config.char_embedding_size, max_sequence_length=self.model_config.max_sequence_length, embeddings=self.embeddings, shuffle=False, features=features, output_input_offsets=True, use_chain_crf=self.model_config.use_chain_crf) # Build the evaluator and evaluate the model scorer = Scorer(test_generator, self.p, evaluation=True, use_crf=self.model_config.use_crf, use_chain_crf=self.model_config.use_chain_crf) scorer.model = the_model scorer.on_epoch_end(epoch=-1) f1 = scorer.f1 precision = scorer.precision recall = scorer.recall reports.append(scorer.report) reports_as_map.append(scorer.report_as_map) if best_f1 < f1: best_f1 = f1 best_index = i if worst_f1 > f1: worst_f1 = f1 worst_index = i total_f1 += f1 total_precision += precision total_recall += recall fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}} micro_f1 = total_f1 / self.model_config.fold_number micro_precision = total_precision / self.model_config.fold_number micro_recall = total_recall / self.model_config.fold_number micro_eval_block = { 'f1': micro_f1, 'precision': micro_precision, 'recall': micro_recall } fold_average_evaluation['micro'] = micro_eval_block # field-level average over the n folds labels = [] for label in sorted(self.p.vocab_tag): if label == 'O' or label == '<PAD>': continue if label.startswith("B-") 
or label.startswith( "S-") or label.startswith("I-") or label.startswith( "E-"): label = label[2:] if label in labels: continue labels.append(label) sum_p = 0 sum_r = 0 sum_f1 = 0 sum_support = 0 for j in range(0, self.model_config.fold_number): if label not in reports_as_map[j]['labels']: continue report_as_map = reports_as_map[j]['labels'][label] sum_p += report_as_map["precision"] sum_r += report_as_map["recall"] sum_f1 += report_as_map["f1"] sum_support += report_as_map["support"] avg_p = sum_p / self.model_config.fold_number avg_r = sum_r / self.model_config.fold_number avg_f1 = sum_f1 / self.model_config.fold_number avg_support = sum_support / self.model_config.fold_number avg_support_dec = str(avg_support - int(avg_support))[1:] if avg_support_dec != '0': avg_support = math.floor(avg_support) block_label = { 'precision': avg_p, 'recall': avg_r, 'support': avg_support, 'f1': avg_f1 } fold_average_evaluation['labels'][label] = block_label print( "----------------------------------------------------------------------" ) print("\n** Worst ** model scores - run", str(worst_index)) print(reports[worst_index]) print("\n** Best ** model scores - run", str(best_index)) print(reports[best_index]) fold_nb = self.model_config.fold_number self.model_config.fold_number = 1 if self.model_config.transformer_name is None: self.model = self.models[best_index] else: dir_path = 'data/models/sequenceLabelling/' weight_file = DEFAULT_WEIGHT_FILE_NAME.replace( ".hdf5", str(best_index) + ".hdf5") # saved config file must be updated to single fold self.model.load(filepath=os.path.join( dir_path, self.model_config.model_name, weight_file)) print( "----------------------------------------------------------------------" ) print("\nAverage over", str(int(fold_nb)), "folds") print( get_report(fold_average_evaluation, digits=4, include_avgs=['micro'])) def tag(self, texts, output_format, features=None, batch_size=None): # annotate a list of sentences, return the list of annotations in the 
# specified output_format if batch_size != None: self.model_config.batch_size = batch_size print("---") print("batch_size (prediction):", self.model_config.batch_size) print("---") if self.model: tagger = Tagger( self.model, self.model_config, self.embeddings, preprocessor=self.p, transformer_preprocessor=self.model.transformer_preprocessor) start_time = time.time() annotations = tagger.tag(texts, output_format, features=features) runtime = round(time.time() - start_time, 3) if output_format == 'json': annotations["runtime"] = runtime #else: # print("runtime: %s seconds " % (runtime)) return annotations else: raise (OSError('Could not find a model.' + str(self.model))) def tag_file(self, file_in, output_format, file_out, batch_size=None): # Annotate a text file containing one sentence per line, the annotations are # written in the output file if not None, in the standard output otherwise. # Processing is streamed by batches so that we can process huge files without # memory issues if batch_size != None: self.model_config.batch_size = batch_size print("---") print("batch_size (prediction):", self.model_config.batch_size) print("---") if self.model: tagger = Tagger( self.model, self.model_config, self.embeddings, preprocessor=self.p, transformer_preprocessor=self.model.transformer_preprocessor) start_time = time.time() if file_out != None: out = open(file_out, 'w') first = True with open(file_in, 'r') as f: texts = None while texts == None or len( texts ) == self.model_config.batch_size * self.nb_workers: texts = next_n_lines( f, self.model_config.batch_size * self.nb_workers) annotations = tagger.tag(texts, output_format) # if the following is true, we just output the JSON returned by the tagger without any modification directDump = False if first: first = False if len( texts ) < self.model_config.batch_size * self.nb_workers: runtime = round(time.time() - start_time, 3) annotations['runtime'] = runtime jsonString = json.dumps(annotations, sort_keys=False, indent=4, 
ensure_ascii=False) if file_out == None: print(jsonString) else: out.write(jsonString) directDump = True else: # we need to modify a bit the JSON outputted by the tagger to glue the different batches # output the general information attributes jsonString = '{\n "software": ' + json.dumps( annotations["software"], ensure_ascii=False) + ",\n" jsonString += ' "date": ' + json.dumps( annotations["date"], ensure_ascii=False) + ",\n" jsonString += ' "model": ' + json.dumps( annotations["model"], ensure_ascii=False) + ",\n" jsonString += ' "texts": [' if file_out == None: print(jsonString, end='', flush=True) else: out.write(jsonString) first = True for jsonStr in annotations["texts"]: jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False) #jsonString = jsonString.replace('\n', '\n\t\t') jsonString = re.sub('\n', '\n ', jsonString) if file_out == None: if not first: print(',\n ' + jsonString, end='', flush=True) else: first = False print('\n ' + jsonString, end='', flush=True) else: if not first: out.write(',\n ') out.write(jsonString) else: first = False out.write('\n ') out.write(jsonString) else: for jsonStr in annotations["texts"]: jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False) jsonString = re.sub('\n', '\n ', jsonString) if file_out == None: print(',\n ' + jsonString, end='', flush=True) else: out.write(',\n ') out.write(jsonString) runtime = round(time.time() - start_time, 3) if not directDump: jsonString = "\n ],\n" jsonString += ' "runtime": ' + str(runtime) jsonString += "\n}\n" if file_out == None: print(jsonString) else: out.write(jsonString) if file_out != None: out.close() #print("runtime: %s seconds " % (runtime)) else: raise (OSError('Could not find a model.')) def save(self, dir_path='data/models/sequenceLabelling/', weight_file=DEFAULT_WEIGHT_FILE_NAME): # create subfolder for the model if not already exists directory = os.path.join(dir_path, self.model_config.model_name) if not 
os.path.exists(directory): os.makedirs(directory) self.model_config.save(os.path.join(directory, CONFIG_FILE_NAME)) print('model config file saved') self.p.save(os.path.join(directory, PROCESSOR_FILE_NAME)) print('preprocessor saved') if self.model is None and self.model_config.fold_number > 1: print( 'Error: model not saved. Evaluation need to be called first to select the best fold model to be saved' ) else: self.model.save(os.path.join(directory, weight_file)) # save pretrained transformer config if used in the model if self.model.transformer_config is not None: self.model.transformer_config.to_json_file( os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME)) print('transformer config saved') if self.model.transformer_preprocessor is not None: self.model.transformer_preprocessor.tokenizer.save_pretrained( os.path.join(directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR)) print('transformer tokenizer saved') print('model saved') def load(self, dir_path='data/models/sequenceLabelling/', weight_file=DEFAULT_WEIGHT_FILE_NAME): model_path = os.path.join(dir_path, self.model_config.model_name) self.model_config = ModelConfig.load( os.path.join(model_path, CONFIG_FILE_NAME)) if self.model_config.embeddings_name is not None: # load embeddings # Do not use cache in 'prediction/production' mode self.embeddings = Embeddings(self.model_config.embeddings_name, resource_registry=self.registry, use_ELMo=self.model_config.use_ELMo, use_cache=False) self.model_config.word_embedding_size = self.embeddings.embed_size else: self.embeddings = None self.model_config.word_embedding_size = 0 self.p = Preprocessor.load( os.path.join(dir_path, self.model_config.model_name, PROCESSOR_FILE_NAME)) self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag), load_pretrained_weights=False, local_path=os.path.join( dir_path, self.model_config.model_name)) print( "load weights from", os.path.join(dir_path, self.model_config.model_name, weight_file)) 
self.model.load(filepath=os.path.join( dir_path, self.model_config.model_name, weight_file)) self.model.print_summary()
def __init__(self,
             model_name=None,
             architecture="gru",
             embeddings_name=None,
             list_classes=None,
             char_emb_size=25,
             dropout=0.5,
             recurrent_dropout=0.25,
             use_char_feature=False,
             batch_size=256,
             optimizer='adam',
             learning_rate=0.001,
             lr_decay=0.9,
             clip_gradients=5.0,
             max_epoch=50,
             patience=5,
             log_dir=None,
             maxlen=300,
             fold_number=1,
             use_roc_auc=True,
             early_stop=True,
             class_weights=None,
             multiprocessing=True,
             transformer_name: str=None):
    """Set up the model and training configurations of a classifier.

    Args:
        model_name: name of the model; when None, a name is derived from
            ``architecture``, followed by ``embeddings_name`` and
            ``transformer_name`` when they are given.
        architecture: architecture identifier stored in the model config.
        embeddings_name: name of the static embeddings; they are loaded only
            when no ``transformer_name`` is given.
        list_classes: class labels stored in the model config; None (the
            default) stands for an empty list.
        transformer_name: name of the transformer; when given, no static
            embeddings are loaded.

    The remaining arguments are forwarded unchanged to ``ModelConfig``
    and/or ``TrainingConfig``.
    """
    # a fresh list per instance: a literal [] default would be one object
    # shared by every instance created without list_classes
    if list_classes is None:
        list_classes = []

    if model_name is None:
        # add a dummy name based on the architecture
        model_name = architecture
        if embeddings_name is not None:
            model_name += "_" + embeddings_name
        if transformer_name is not None:
            model_name += "_" + transformer_name

    self.model = None
    self.models = None
    self.log_dir = log_dir
    self.embeddings_name = embeddings_name
    self.embeddings = None

    # if transformer_name is None, no bert layer is present in the model
    self.transformer_name = transformer_name

    self.registry = load_resource_registry("delft/resources-registry.json")

    word_emb_size = 0
    if transformer_name is not None:
        # a transformer takes the place of the static embeddings
        self.embeddings_name = None
    elif self.embeddings_name is not None:
        self.embeddings = Embeddings(self.embeddings_name,
                                     resource_registry=self.registry)
        word_emb_size = self.embeddings.embed_size

    # NOTE(review): the config receives the embeddings_name argument, not
    # self.embeddings_name, so it keeps the name even when a transformer is
    # used - confirm this is intended.
    self.model_config = ModelConfig(model_name=model_name,
                                    architecture=architecture,
                                    embeddings_name=embeddings_name,
                                    list_classes=list_classes,
                                    char_emb_size=char_emb_size,
                                    word_emb_size=word_emb_size,
                                    dropout=dropout,
                                    recurrent_dropout=recurrent_dropout,
                                    use_char_feature=use_char_feature,
                                    maxlen=maxlen,
                                    fold_number=fold_number,
                                    batch_size=batch_size,
                                    transformer_name=self.transformer_name)

    self.training_config = TrainingConfig(batch_size=batch_size,
                                          optimizer=optimizer,
                                          learning_rate=learning_rate,
                                          lr_decay=lr_decay,
                                          clip_gradients=clip_gradients,
                                          max_epoch=max_epoch,
                                          patience=patience,
                                          use_roc_auc=use_roc_auc,
                                          early_stop=early_stop,
                                          class_weights=class_weights,
                                          multiprocessing=multiprocessing)
class Classifier(object):
    """Text classifier wrapper: training, prediction, evaluation, save/load.

    The wrapper holds either a single model (``self.model``, used when
    ``fold_number`` equals 1) or one model per fold (``self.models``).
    """

    # file names used inside the model sub-folder
    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None):
        """Build the model and training configurations.

        Embeddings are loaded only when ``embeddings_name`` is given.
        ``list_classes`` defaults to an empty list. The ``embeddings``
        argument is accepted but not used here.
        """
        # a fresh list per instance: a literal [] default would be one
        # object shared by every instance created without list_classes
        if list_classes is None:
            list_classes = []

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        # always defined, so later accesses do not depend on the branch below
        self.embeddings = None

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size,
                                              optimizer,
                                              learning_rate,
                                              lr_decay,
                                              clip_gradients,
                                              max_epoch,
                                              patience,
                                              use_roc_auc,
                                              class_weights=class_weights)

    def _clean_embedding_caches(self):
        # Release the ELMo / BERT caches of the embeddings when they are used.
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def _new_generator(self, texts, batch_size):
        # Non-shuffled, label-less data generator over the given texts.
        return DataGenerator(texts,
                             None,
                             batch_size=batch_size,
                             maxlen=self.model_config.maxlen,
                             list_classes=self.model_config.list_classes,
                             embeddings=self.embeddings,
                             shuffle=False)

    def _predict_scores(self, texts):
        # Run the single model or the fold models on the texts and return
        # the raw prediction result.
        if self.model_config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            generator = self._new_generator(texts,
                                            self.model_config.batch_size)
            return predict(self.model,
                           generator,
                           use_ELMo=self.embeddings.use_ELMo,
                           use_BERT=self.embeddings.use_BERT)

        if self.models is None:
            raise OSError('Could not find nfolds models.')
        generator = self._new_generator(texts, self.model_config.batch_size)
        return predict_folds(self.models,
                             generator,
                             use_ELMo=self.embeddings.use_ELMo,
                             use_BERT=self.embeddings.use_BERT)

    def train(self, x_train, y_train, vocab_init=None):
        """Train a single model, holding out 10% of the data for validation.

        ``vocab_init`` is accepted but not used here.
        """
        # create validation set in case we don't use k-folds
        xtr, val_x, y, val_y = train_test_split(x_train,
                                                y_train,
                                                test_size=0.1)

        training_generator = DataGenerator(
            xtr,
            y,
            batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=True)
        # the validation labels are handed to train_model separately
        validation_generator = self._new_generator(
            val_x, self.training_config.batch_size)

        self.model = getModel(self.model_config, self.training_config)
        self.model, best_roc_auc = train_model(
            self.model,
            self.model_config.list_classes,
            self.training_config.batch_size,
            self.training_config.max_epoch,
            self.training_config.use_roc_auc,
            self.training_config.class_weights,
            training_generator,
            validation_generator,
            val_y,
            use_ELMo=self.embeddings.use_ELMo,
            use_BERT=self.embeddings.use_BERT)
        self._clean_embedding_caches()

    def train_nfold(self, x_train, y_train, vocab_init=None):
        """Train one model per fold and keep them in ``self.models``.

        ``vocab_init`` is accepted but not used here.
        """
        self.models = train_folds(x_train, y_train, self.model_config,
                                  self.training_config, self.embeddings)
        self._clean_embedding_caches()

    # classification
    def predict(self, texts, output_format='json'):
        """Classify the given texts.

        Args:
            texts: the texts to classify.
            output_format: ``'json'`` to get a dictionary with general
                attributes and one entry per text under
                ``"classifications"``; any other value returns the raw
                prediction result.

        Raises:
            OSError: if the model(s) required by ``fold_number`` are not
                available.
        """
        result = self._predict_scores(texts)
        if output_format != 'json':
            return result

        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "classifications": []
        }
        for i, text in enumerate(texts):
            classification = {"text": text}
            the_res = result[i]
            # one score per class, in the order of list_classes
            for j, cl in enumerate(self.model_config.list_classes):
                classification[cl] = float(the_res[j])
            res["classifications"].append(classification)
        return res

    def eval(self, x_test, y_test):
        """Print accuracy, f-1, log-loss and ROC AUC on a test set.

        Predictions are binarized at 0.5 for accuracy and f-1. With one
        class the scores are computed directly; with several classes they
        are printed per class, then macro-averaged over classes and
        micro-averaged over instances.

        Raises:
            OSError: if the model(s) required by ``fold_number`` are not
                available.
        """
        result = self._predict_scores(x_test)
        classes = self.model_config.list_classes
        nb_classes = len(classes)

        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # scores below 0.5 map to 0, everything else to 1
        result_binary = np.where(np.asarray(result) < 0.5, 0, 1)

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if nb_classes == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result)
            total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(0, nb_classes):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j],
                              result_binary[:, j],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc
                print("\nClass:", classes[j])
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)

        total_accuracy /= nb_classes
        total_f1 /= nb_classes
        total_loss /= nb_classes
        total_roc_auc /= nb_classes

        if nb_classes != 1:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =", "{:10.4f}".format(total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

        # micro-average (average of scores for each instance)
        # make sense only if we have more than 1 class, otherwise same as
        # macro-average
        if nb_classes != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(0, result.shape[0]):
                accuracy = accuracy_score(y_test[i, :], result_binary[i, :])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i, :],
                              result_binary[i, :],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i, :], result[i, :])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i, :], result[i, :])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

            print("\nMicro-average:")
            print("\taverage accuracy at 0.5 =",
                  "{:10.4f}".format(total_accuracy))
            print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

    def save(self, dir_path='data/models/textClassification/'):
        """Save the config and the model weights under the model sub-folder.

        With one fold the weights go to ``<model_type>.model_weights.hdf5``;
        with several folds, one ``<model_type>.model<i>_weights.hdf5`` file
        is written per fold. A missing model is reported with a message.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        if self.model_config.fold_number == 1:
            if self.model is not None:
                self.model.save(
                    os.path.join(
                        directory, self.model_config.model_type + "." +
                        self.weight_file))
                print('model saved')
            else:
                print('Error: model has not been built')
        else:
            if self.models is None:
                print('Error: nfolds models have not been built')
            else:
                for i in range(0, self.model_config.fold_number):
                    self.models[i].save(
                        os.path.join(
                            directory, self.model_config.model_type +
                            ".model{0}_weights.hdf5".format(i)))
                print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Load the config, the embeddings and the weights of a saved model.

        With one fold the weights are loaded into ``self.model``; otherwise
        one model per fold is built and collected in ``self.models``.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(
                    dir_path, self.model_config.model_name,
                    self.model_config.model_type + "." + self.weight_file))
        else:
            self.models = []
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config,
                                       self.training_config)
                local_model.load_weights(
                    os.path.join(
                        dir_path, self.model_config.model_name,
                        self.model_config.model_type +
                        ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)