def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):
    """Load an archived model and, when configured, the ELMo embedder.

    The archive is extracted into the current working directory; the first
    member name of the archive is taken as the folder that holds
    ``config.conf`` (a pickled config) and ``lstm_crf.m`` (the state dict).

    Args:
        model_archived_file: Path to the tar archive of a trained model.
        cuda_device: Torch device string, e.g. ``"cpu"``, ``"cuda"`` or
            ``"cuda:0"``.
    """
    # NOTE(security): extractall() and pickle.load() both fully trust the
    # archive contents -- only load model archives from a trusted source.
    with tarfile.open(model_archived_file) as tar:
        tar.extractall()
        folder_name = tar.getnames()[0]

    # Original author's notes: pickled variables come out in the order they
    # were put in; the default batch size stored in the config is `10`.
    with open(folder_name + "/config.conf", 'rb') as f:
        self.conf = pickle.load(f)

    device = torch.device(cuda_device)
    self.conf.device = device
    self.model = NNCRF(self.conf, print_info=False)
    self.model.load_state_dict(
        torch.load(folder_name + "/lstm_crf.m", map_location=device))
    self.model.eval()

    if self.conf.context_emb != ContextEmb.none:
        # load_elmo is given an integer device index: -1 for CPU, otherwise
        # the number after the colon.  A bare "cuda" (no index) maps to 0
        # instead of raising IndexError.
        if cuda_device == "cpu":
            elmo_device = -1
        else:
            parts = cuda_device.split(":")
            elmo_device = int(parts[1]) if len(parts) > 1 else 0
        self.elmo = load_elmo(elmo_device)
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    """Decode every batch and print/return precision, recall and F1.

    Args:
        config: Provides ``batch_size`` and ``idx2labels``.
        model: Model exposing ``decode(batch)``.
        batch_insts_ids: Iterable of batches. Each batch is indexed
            positionally: ``batch[1]`` and ``batch[-1]`` are forwarded to
            ``evaluate_num`` (presumably sequence lengths and gold labels --
            confirm against ``evaluate_num``).
        name: Label of the data split, used only in the printed summary.
        insts: Instances in the same order as the batches, ``batch_size``
            instances per batch.

    Returns:
        ``[precision, recall, fscore]`` as percentages; each is 0 when its
        denominator is 0.
    """
    # Running totals: [correct, predicted, gold].
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_size = config.batch_size

    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for batch_id, batch in enumerate(batch_insts_ids):
            one_batch_insts = insts[batch_id * batch_size:
                                    (batch_id + 1) * batch_size]
            # Longest-first order, presumably mirroring how the batch tensors
            # were built -- confirm against the batching code.
            sorted_batch_insts = sorted(
                one_batch_insts,
                key=lambda inst: len(inst.input.words),
                reverse=True)
            batch_max_scores, batch_max_ids = model.decode(batch)
            metrics += evaluate_num(sorted_batch_insts, batch_max_ids,
                                    batch[-1], batch[1], config.idx2labels)

    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0
    print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    return [precision, recall, fscore]
def evaluate_model(config: Config,
                   model: NNCRF,
                   batch_insts_ids,
                   name: str,
                   insts: List[Instance],
                   print_each_type_metric: bool = False):
    """Score the model on one data split and print the overall metrics.

    Every batch is decoded under ``torch.no_grad()`` and the per-type counts
    returned by ``evaluate_batch_insts`` are accumulated in three Counters.

    Args:
        config: Provides ``batch_size`` and ``idx2labels``.
        model: Model whose ``decode`` accepts the batch as keyword arguments.
        batch_insts_ids: Iterable of dict batches; the ``"labels"`` and
            ``"word_seq_lens"`` entries are read here.
        name: Label of the data split, used in the printed summary.
        insts: Instances in the same order as the batches.
        print_each_type_metric: Also print one metric line per key of the
            gold-entity Counter.

    Returns:
        ``[precision, recall, fscore]`` as produced by ``get_metric``.
    """
    correct_by_type = Counter()
    predicted_by_type = Counter()
    gold_by_type = Counter()
    step = config.batch_size

    with torch.no_grad():
        for batch_idx, batch in enumerate(batch_insts_ids):
            start = batch_idx * step
            batch_insts = insts[start:start + step]
            _, best_ids = model.decode(**batch)
            batch_correct, batch_predicted, batch_gold = evaluate_batch_insts(
                batch_insts, best_ids, batch["labels"],
                batch["word_seq_lens"], config.idx2labels)
            correct_by_type += batch_correct
            predicted_by_type += batch_predicted
            gold_by_type += batch_gold

    if print_each_type_metric:
        for key in gold_by_type:
            precision_key, recall_key, fscore_key = get_metric(
                correct_by_type[key], gold_by_type[key],
                predicted_by_type[key])
            print(
                f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}"
            )

    precision, recall, fscore = get_metric(
        sum(correct_by_type.values()),
        sum(gold_by_type.values()),
        sum(predicted_by_type.values()))
    summary = colored(
        f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}",
        'blue')
    print(summary, flush=True)
    return [precision, recall, fscore]
def test_model(config: Config, test_insts):
    """Load the saved model for this configuration and evaluate it on test data.

    The model and result file names are derived from the same six config
    fields.  Note that the ``dep_{}`` and ``elmo_{}`` slots of the file name
    are filled with the context-embedding name and the lower-cased optimizer
    name respectively.

    Args:
        config: Experiment configuration used to build the model and names.
        test_insts: Test instances; batched, evaluated, then written out via
            ``write_results``.
    """
    name_fields = (
        config.hidden_dim,
        config.dataset,
        config.train_num,
        config.context_emb.name,
        config.optimizer.lower(),
        config.learning_rate,
    )
    model_name = "model_files/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.m".format(
        *name_fields)
    res_name = "results/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.results".format(
        *name_fields)

    model = NNCRF(config)
    saved_state = torch.load(model_name)
    model.load_state_dict(saved_state)
    model.eval()

    evaluate_model(config, model,
                   batching_list_instances(config, test_insts), "test",
                   test_insts)
    write_results(res_name, test_insts)
class NERPredictor:
    """
    Usage:
    sentence = "This is a sentence"
    model_path = "model_files.tar.gz"
    model = NERPredictor(model_path)
    prediction = model.predict(sentence)
    """

    def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):
        """Load an archived model and, when configured, the ELMo embedder.

        The archive is extracted into the current working directory; the
        first member name of the archive is taken as the folder that holds
        ``config.conf`` (a pickled config) and ``lstm_crf.m`` (state dict).

        Args:
            model_archived_file: Path to the tar archive of a trained model.
            cuda_device: Torch device string, e.g. ``"cpu"``, ``"cuda"`` or
                ``"cuda:0"``.
        """
        # NOTE(security): extractall() and pickle.load() both fully trust the
        # archive contents -- only load archives from a trusted source.
        with tarfile.open(model_archived_file) as tar:
            tar.extractall()
            folder_name = tar.getnames()[0]

        # Original author's notes: pickled variables come out in the order
        # they were put in; the default batch size in the config is `10`.
        with open(folder_name + "/config.conf", 'rb') as f:
            self.conf = pickle.load(f)

        device = torch.device(cuda_device)
        self.conf.device = device
        self.model = NNCRF(self.conf, print_info=False)
        self.model.load_state_dict(
            torch.load(folder_name + "/lstm_crf.m", map_location=device))
        self.model.eval()

        if self.conf.context_emb != ContextEmb.none:
            self.elmo = load_elmo(self._elmo_device_index(cuda_device))

    @staticmethod
    def _elmo_device_index(cuda_device: str) -> int:
        """Convert a torch device string to the integer given to ``load_elmo``.

        ``"cpu"`` maps to -1, ``"cuda:N"`` to ``N`` and a bare ``"cuda"``
        (no explicit index) to 0.
        """
        if cuda_device == "cpu":
            return -1
        parts = cuda_device.split(":")
        return int(parts[1]) if len(parts) > 1 else 0

    def predict_insts(self, batch_insts_ids: Tuple) -> List[List[str]]:
        """Decode one batch and map the predicted ids to label strings.

        Args:
            batch_insts_ids: Batch tuple accepted by ``model.decode``; element
                ``[1]`` is read as the per-sequence length.

        Returns:
            One list of label strings per sequence in the batch.
        """
        batch_max_scores, batch_max_ids = self.model.decode(batch_insts_ids)
        idx2labels = self.conf.idx2labels
        predictions = []
        for idx, max_ids in enumerate(batch_max_ids):
            length = batch_insts_ids[1][idx]
            # Keep the first `length` ids, then reverse them -- presumably
            # decode() emits each sequence back-to-front; confirm.
            label_ids = max_ids[:length].tolist()[::-1]
            predictions.append([idx2labels[label_id] for label_id in label_ids])
        return predictions

    def sent_to_insts(self, sentence: str) -> List[Instance]:
        """Wrap a single whitespace-tokenised sentence in a one-item list."""
        return self.sents_to_insts([sentence])

    def sents_to_insts(self, sentences: List[str]) -> List[Instance]:
        """Build one ``Instance`` per sentence, splitting on whitespace."""
        return [Instance(Sentence(sentence.split())) for sentence in sentences]

    def create_batch_data(self, insts: List[Instance]):
        """Batch the instances with ``simple_batching`` under this config."""
        return simple_batching(self.conf, insts)

    def predict(self, sentences: Union[str, List[str]]):
        """Predict labels for one sentence or a list of sentences.

        Args:
            sentences: A single sentence string or a list of them; each is
                tokenised with ``str.split()``.

        Returns:
            A flat list of labels when exactly one sentence was predicted
            (including a one-element list input, kept for backward
            compatibility), otherwise a list of label lists.  An empty list
            input yields ``[]``.
        """
        sents = [sentences] if isinstance(sentences, str) else sentences
        if not sents:
            # Nothing to predict; avoid batching zero instances.
            return []

        insts = self.sents_to_insts(sents)
        self.conf.map_insts_ids(insts)
        if self.conf.context_emb != ContextEmb.none:
            read_parse_write(self.elmo, insts)
        # NOTE(review): if simple_batching reorders instances (e.g. by
        # length), the predictions follow the batch order -- confirm.
        test_batches = self.create_batch_data(insts)
        predictions = self.predict_insts(test_batches)
        if len(predictions) == 1:
            return predictions[0]
        return predictions
def train_model(config: Config, epoch: int, train_insts: List[Instance],
                dev_insts: List[Instance], test_insts: List[Instance]):
    """Train a CRF tagger with early stopping, then archive and re-test it.

    After every epoch the model is evaluated on dev and test.  Whenever the
    dev F1 improves, the state dict, the pickled config and the test results
    are written under ``model_files/<config.model_folder>`` and ``results``.
    Training stops early once dev F1 has not improved for
    ``config.max_no_incre`` consecutive epochs.  Finally the model folder is
    archived as ``.tar.gz`` and the best checkpoint is reloaded and evaluated
    on the test set once more.

    Args:
        config: Experiment configuration (device, optimizer, folders, ...).
        epoch: Maximum number of training epochs.
        train_insts: Training instances; shuffled in place.
        dev_insts: Development instances used for model selection.
        test_insts: Test instances.

    Raises:
        FileExistsError: If ``model_files/<config.model_folder>`` already
            exists, to avoid overwriting a previous run.
    """
    ### Data Processing Info
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    if config.embedder_type == "normal":
        model = NNCRF(config)
        optimizer = get_optimizer(config, model)
        scheduler = None
    else:
        print(
            colored(
                f"[Model Info]: Working with transformers package from huggingface with {config.embedder_type}",
                'red'))
        print(
            colored(
                "[Optimizer Info]: You should be aware that you are using the optimizer from huggingface.",
                'red'))
        print(
            colored(
                "[Optimizer Info]: Change the optimier in transformers_util.py if you want to make some modifications.",
                'red'))
        model = TransformersCRF(config)
        # The scheduler is stepped once per batch, hence batches * epochs.
        optimizer, scheduler = get_huggingface_optimizer_and_scheduler(
            config,
            model,
            num_training_steps=len(batched_data) * epoch,
            weight_decay=0.0,
            eps=1e-8,
            warmup_step=0)
        print(
            colored("[Optimizer Info] Modify the optimizer info as you need.",
                    'red'))
        print(optimizer)

    model.to(config.device)

    # Each holds [best F1 so far, epoch at which it was reached].
    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(
            f"The folder model_files/{model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    # Create the output folders; exist_ok avoids an error if they exist.
    os.makedirs(f"model_files/{model_folder}", exist_ok=True)
    os.makedirs(res_folder, exist_ok=True)

    no_incre_dev = 0
    print(
        colored(
            f"[Train Info] Start training, you have set to stop if performace not increase for {config.max_no_incre} epochs",
            'red'))
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        # Visit the batches in a fresh random order every epoch.
        for index in tqdm(np.random.permutation(len(batched_data)),
                          desc="--training batch",
                          total=len(batched_data)):
            model.train()
            loss = model(**batched_data[index])
            epoch_loss += loss.item()
            loss.backward()
            if config.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()
            if scheduler is not None:
                scheduler.step()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        # Index 2 of the returned metrics is the F1 score.
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.  The context manager
            # closes the file even if pickling fails.
            with open(config_path, 'wb') as f:
                pickle.dump(config, f)
            write_results(res_path, test_insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print(
                "early stop because there are %d epochs not increasing f1 on dev"
                % no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}/{model_folder}.tar.gz",
                      "w:gz") as tar:
        tar.add(f"model_files/{model_folder}",
                arcname=os.path.basename(model_folder))
    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_path, test_insts)
def learn_from_insts(config: Config, epoch: int, train_insts, dev_insts,
                     test_insts):
    """Train an ``NNCRF`` for a fixed number of epochs, keeping the best model.

    After each epoch the model is evaluated on dev and test; when the dev F1
    improves, the state dict and the test results are written to files whose
    names are derived from the config.  At the end the best checkpoint is
    reloaded and evaluated on the test set once more.

    Args:
        config: Experiment configuration.
        epoch: Number of training epochs.
        train_insts: Training instances (original note: ``List[Instance]``);
            shuffled in place.
        dev_insts: Development instances used for model selection.
        test_insts: Test instances.
    """
    model = NNCRF(config)
    optimizer = get_optimizer(config, model)

    print("number of instances: %d" % len(train_insts))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    # Best dev F1 seen so far and the test F1 from the same epoch.
    best_dev_f1 = -1
    best_test_f1 = -1

    model_folder = "model_files"
    res_folder = "results"
    # Both file names are built from the same six config fields.
    name_fields = (
        config.hidden_dim,
        config.dataset,
        config.train_num,
        config.context_emb.name,
        config.optimizer.lower(),
        config.learning_rate,
    )
    model_name = model_folder + "/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.m".format(
        *name_fields)
    res_name = res_folder + "/lstm_{}_crf_{}_{}_dep_{}_elmo_{}_lr_{}.results".format(
        *name_fields)
    print("[Info] The model will be saved to: %s" % (model_name))

    for folder in (model_folder, res_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)

    for epoch_idx in range(1, epoch + 1):
        epoch_loss = 0
        started = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, epoch_idx)

        # Visit the batches in a fresh random order every epoch.
        for batch_pos in np.random.permutation(len(batched_data)):
            model.train()
            (words, word_lens, context_emb, chars, char_lens,
             labels) = batched_data[batch_pos]
            loss = model.neg_log_obj(words, word_lens, context_emb, chars,
                                     char_lens, labels)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()

        finished = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (epoch_idx, epoch_loss, finished - started),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        # Index 2 of the returned metrics is the F1 score.
        if dev_metrics[2] > best_dev_f1:
            print("saving the best model...")
            best_dev_f1 = dev_metrics[2]
            best_test_f1 = test_metrics[2]
            torch.save(model.state_dict(), model_name)
            write_results(res_name, test_insts)
        model.zero_grad()

    print("The best dev: %.2f" % best_dev_f1)
    print("The corresponding test: %.2f" % best_test_f1)
    print("Final testing.")
    model.load_state_dict(torch.load(model_name))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_name, test_insts)