def predict_dep(self, batch, unsort=True):
    """Run the dependency head on one batch.

    Returns, per sentence, a list of ``[head_index_str, deprel_label]`` pairs
    (one per token, root attachment excluded). If ``unsort`` is True the
    sentences are restored to their pre-batching order.
    """
    inputs, num_words, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="dep")
    tokens_phobert, first_subword, words_mask, head, deprel = inputs
    self.model.eval()
    n_sents = tokens_phobert.size(0)
    _, preds = self.model.dep_forward(
        tokens_phobert,
        first_subword,
        words_mask,
        num_words,
        sentlens,
        head,
        deprel,
        eval=True,
    )
    # MST-decode each adjacency matrix; drop index 0 (the synthetic root).
    head_seqs = []
    for adj, length in zip(preds[0], sentlens):
        head_seqs.append(chuliu_edmonds_one_root(adj[:length, :length])[1:])
    # Map predicted relation ids back to label strings, following the chosen heads.
    deprel_seqs = []
    for i, hs in enumerate(head_seqs):
        rel_ids = [preds[1][i][j + 1][h] for j, h in enumerate(hs)]
        deprel_seqs.append(self.vocab["deprel"].unmap(rel_ids))
    pred_tokens = [
        [[str(head_seqs[i][j]), deprel_seqs[i][j]] for j in range(sentlens[i] - 1)]
        for i in range(n_sents)
    ]
    if unsort:
        pred_tokens = util.unsort(pred_tokens, orig_idx)
    return pred_tokens
def predict_pos(self, batch, unsort=True):
    """Predict UPOS tags for one batch.

    Returns, per sentence, a list of single-element ``[upos]`` lists (one per
    token). If ``unsort`` is True the sentences are restored to their
    pre-batching order.
    """
    inputs, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="pos")
    tokens_phobert, first_subword, upos = inputs
    self.model.eval()
    n_sents = tokens_phobert.size(0)
    _, preds = self.model.pos_forward(tokens_phobert, first_subword, sentlens, False, upos)
    tag_rows = [self.vocab["upos"].unmap(row) for row in preds[0].tolist()]
    pred_tokens = []
    for i in range(n_sents):
        # Each token is wrapped in its own list (xpos/feats columns were dropped).
        pred_tokens.append([[tag_rows[i][j]] for j in range(sentlens[i])])
    if unsort:
        pred_tokens = util.unsort(pred_tokens, orig_idx)
    return pred_tokens
def predict_ner(self, batch, unsort=True):
    """Viterbi-decode NER tags for one batch.

    Returns a list of per-sentence tag-string sequences. If ``unsort`` is
    True the sentences are restored to their pre-batching order.
    """
    inputs, orig_idx, sentlens = unpack_batch(batch, self.use_cuda, type="ner")
    tokens_phobert, first_subword, word_mask, tags = inputs
    self.model.eval()
    _, logits = self.model.ner_forward(
        tokens_phobert, first_subword, word_mask, sentlens, tags
    )
    # Decode with the CRF transition matrix on CPU.
    transitions = self.model.crit_ner._transitions.data.cpu().numpy()
    emissions = logits.data.cpu().numpy()
    tag_seqs = []
    for i in range(logits.size(0)):
        best_path, _ = viterbi_decode(emissions[i, : sentlens[i]], transitions)
        tag_seqs.append(self.vocab["ner_tag"].unmap(best_path))
    if unsort:
        tag_seqs = util.unsort(tag_seqs, orig_idx)
    return tag_seqs
def evaluate(args):
    """Evaluate a saved joint model on the POS and NER evaluation files.

    Loads the checkpoint from ``args["save_dir"]/phonlp.pt``, runs prediction
    over the POS and NER eval sets and prints POS accuracy and NER F1.
    Dependency evaluation is currently disabled (the loader is still built so
    the data pipeline is exercised, but no dep scoring happens).
    """
    # Kept for the (disabled) dependency evaluation; also validates arg keys.
    system_pred_file = args["output_file_dep"]
    gold_file = args["eval_file_dep"]
    model_file = args["save_dir"] + "/" + "phonlp.pt"

    # map_location lambda keeps tensors on CPU regardless of where they were saved.
    checkpoint = torch.load(model_file, lambda storage, loc: storage)
    loaded_args = checkpoint["config"]
    vocab = MultiVocab.load_state_dict(checkpoint["vocab"])
    config_phobert = AutoConfig.from_pretrained(loaded_args["pretrained_lm"], output_hidden_states=True)
    tokenizer = AutoTokenizer.from_pretrained(loaded_args["pretrained_lm"], use_fast=False)

    print("Loading model from: {}".format(model_file))
    use_cuda = args["cuda"] and not args["cpu"]
    trainer = JointTrainer(model_file=model_file, use_cuda=use_cuda, config_phobert=config_phobert)

    print("Loading data with batch size {}...".format(args["batch_size"]))
    test_doc_dep = Document(CoNLL.conll2dict(input_file=args["eval_file_dep"]))
    test_batch_pos = DataLoaderPOS(
        args["eval_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    test_batch_dep = DataLoaderDep(
        test_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    test_batch_ner = DataLoaderNER(
        args["eval_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )

    print("Start evaluation...")

    # POS: predict, restore original sentence order, score accuracy.
    test_preds_upos = []
    for batch in test_batch_pos:
        test_preds_upos += trainer.predict_pos(batch)
    test_preds_upos = util.unsort(test_preds_upos, test_batch_pos.data_orig_idx_pos)
    accuracy_pos = score_pos.score_acc(test_preds_upos, test_batch_pos.upos)

    # NER: predict and score entity-level P/R/F1 (only F1 is reported).
    test_preds_ner = []
    for batch in test_batch_ner:
        test_preds_ner += trainer.predict_ner(batch)
    p, r, f1 = score_ner.score_by_entity(test_preds_ner, test_batch_ner.tags)

    print("{} POS tagging: {:.2f}, NER: {:.2f}".format("Evaluation results: ", accuracy_pos * 100, f1 * 100))
def train(args):
    """Jointly train the POS, NER and dependency heads on a shared PhoBERT encoder.

    Iterates ``args["num_epoch"]`` epochs over the largest of the three task
    loaders (the smaller ones are reshuffled each time they wrap around),
    evaluates on the dev sets every ``args["eval_interval"]`` steps and at the
    end of each epoch, and saves the best checkpoint — by the sum of dev LAS,
    POS accuracy and NER F1 — to ``args["save_dir"]/phonlp.pt``.

    Exits the process (``sys.exit(0)``) when there is no training or dev data.
    """
    util.ensure_dir(args["save_dir"])
    model_file = args["save_dir"] + "/" + "phonlp.pt"
    tokenizer = AutoTokenizer.from_pretrained(args["pretrained_lm"], use_fast=False)
    config_phobert = AutoConfig.from_pretrained(args["pretrained_lm"], output_hidden_states=True)

    print("Loading data with batch size {}...".format(args["batch_size"]))
    train_doc_dep = Document(CoNLL.conll2dict(input_file=args["train_file_dep"]))
    vocab = BuildVocab(args, args["train_file_pos"], train_doc_dep, args["train_file_ner"]).vocab
    train_batch_pos = DataLoaderPOS(
        args["train_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    train_batch_dep = DataLoaderDep(
        train_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    train_batch_ner = DataLoaderNER(
        args["train_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=False,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    dev_doc_dep = Document(CoNLL.conll2dict(input_file=args["eval_file_dep"]))
    dev_batch_pos = DataLoaderPOS(
        args["eval_file_pos"],
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    dev_batch_dep = DataLoaderDep(
        dev_doc_dep,
        args["batch_size"],
        args,
        vocab=vocab,
        sort_during_eval=True,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )
    dev_batch_ner = DataLoaderNER(
        args["eval_file_ner"],
        args["batch_size"],
        args,
        vocab=vocab,
        evaluation=True,
        tokenizer=tokenizer,
        max_seq_length=args["max_sequence_length"],
    )

    # Pred and gold paths for dependency scoring on dev.
    system_pred_file = args["output_file_dep"]
    gold_file = args["eval_file_dep"]
    dev_gold_tags = dev_batch_ner.tags

    # Skip training if the language does not have training or dev data.
    if len(train_batch_pos) == 0 or len(dev_batch_pos) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training jointmodel...")
    trainer = JointTrainer(args, vocab, None, config_phobert, args["cuda"])

    # Make sure every PhoBERT parameter is trainable (no frozen layers).
    tsfm = trainer.model.phobert
    for child in tsfm.children():
        for param in child.parameters():
            if not param.requires_grad:
                print("whoopsies")
                param.requires_grad = True

    global_step = 0
    las_score_history = 0
    uas_score_history = 0
    upos_score_history = 0
    f1_score_history = 0

    # Running loss accumulators (train_loss_dep is not updated by trainer.update
    # and is reported as its running value — kept for log-format compatibility).
    train_loss = 0
    train_loss_pos = 0
    train_loss_dep = 0
    train_loss_ner = 0

    # Optimizer and LR schedule: AdamW with weight decay on everything except
    # biases and LayerNorm parameters, plus linear warmup/decay.
    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    num_train_optimization_steps = int(args["num_epoch"] * len(train_batch_pos) / args["accumulation_steps"])
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=args["lr"], correct_bias=False
    )  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=5, num_training_steps=num_train_optimization_steps
    )
    # NOTE(review): a stray `get_constant_schedule(optimizer)` call was removed
    # here — its return value was discarded and it did not affect training.

    for epoch in range(args["num_epoch"]):
        optimizer.zero_grad()
        print(" EPOCH : ", epoch)
        step = 0
        lambda_pos = args["lambda_pos"]
        lambda_ner = args["lambda_ner"]
        lambda_dep = args["lambda_dep"]
        # One "epoch" walks the longest loader once; shorter loaders wrap.
        epoch_size = max([len(train_batch_pos), len(train_batch_dep), len(train_batch_ner)])
        for i in tqdm(range(epoch_size)):
            step += 1
            global_step += 1
            batch_pos = train_batch_pos[i]
            batch_dep = train_batch_dep[i]
            batch_ner = train_batch_ner[i]
            loss, loss_pos, loss_ner = trainer.update(
                batch_dep,
                batch_pos,
                batch_ner,
                lambda_pos=lambda_pos,
                lambda_dep=lambda_dep,
                lambda_ner=lambda_ner,
            )  # update step
            train_loss += loss
            train_loss_pos += loss_pos
            train_loss_ner += loss_ner

            # Gradient accumulation: only step every accumulation_steps batches.
            if i % args["accumulation_steps"] == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

            # Reshuffle each *shorter* loader whenever it wraps around.
            if epoch_size == len(train_batch_pos):
                if step % len(train_batch_dep) == 0:
                    train_batch_dep.reshuffle()
                if step % len(train_batch_ner) == 0:
                    train_batch_ner.reshuffle()
            elif epoch_size == len(train_batch_ner):
                if step % len(train_batch_dep) == 0:
                    train_batch_dep.reshuffle()
                if step % len(train_batch_pos) == 0:
                    train_batch_pos.reshuffle()
            elif epoch_size == len(train_batch_dep):
                if step % len(train_batch_pos) == 0:
                    # BUG FIX: this previously reshuffled train_batch_dep, so
                    # the POS loader was never reshuffled when dep was largest.
                    train_batch_pos.reshuffle()
                if step % len(train_batch_ner) == 0:
                    train_batch_ner.reshuffle()

            if step % args["eval_interval"] == 0:
                print("Evaluating on dev set...")
                dev_preds_dep = []
                dev_preds_upos = []
                dev_preds_ner = []
                for batch in dev_batch_dep:
                    dev_preds_dep += trainer.predict_dep(batch)
                dev_preds_dep = util.unsort(dev_preds_dep, dev_batch_dep.data_orig_idx_dep)
                dev_batch_dep.doc_dep.set([HEAD, DEPREL], [y for x in dev_preds_dep for y in x])
                CoNLL.dict2conll(dev_batch_dep.doc_dep.to_dict(), system_pred_file)
                _, _, las_dev, uas_dev = score_dep.score(system_pred_file, gold_file)

                for batch in dev_batch_pos:
                    dev_preds_upos += trainer.predict_pos(batch)
                dev_preds_upos = util.unsort(dev_preds_upos, dev_batch_pos.data_orig_idx_pos)
                accuracy_pos_dev = score_pos.score_acc(dev_preds_upos, dev_batch_pos.upos)

                for batch in dev_batch_ner:
                    dev_preds_ner += trainer.predict_ner(batch)
                p, r, f1 = score_ner.score_by_entity(dev_preds_ner, dev_gold_tags)
                for k in range(len(dev_batch_ner)):
                    assert len(dev_preds_ner[k]) == len(dev_gold_tags[k])

                print(
                    "step {}: dev_las_score = {:.4f}, dev_uas_score = {:.4f}, dev_pos = {:.4f}, dev_ner_p = {:.4f}, dev_ner_r = {:.4f}, dev_ner_f1 = {:.4f}"
                    .format(global_step, las_dev, uas_dev, accuracy_pos_dev, p, r, f1))

                # Save best model (ties count as improvement).
                if las_dev + accuracy_pos_dev + f1 >= (las_score_history + upos_score_history + f1_score_history):
                    las_score_history = las_dev
                    upos_score_history = accuracy_pos_dev
                    uas_score_history = uas_dev
                    f1_score_history = f1
                    trainer.save(model_file)
                    print("new best model saved.")
                print("")

        # End-of-epoch dev evaluation, identical to the in-epoch one.
        print("Evaluating on dev set...")
        dev_preds_dep = []
        dev_preds_upos = []
        dev_preds_ner = []
        for batch in dev_batch_dep:
            dev_preds_dep += trainer.predict_dep(batch)
        dev_preds_dep = util.unsort(dev_preds_dep, dev_batch_dep.data_orig_idx_dep)
        dev_batch_dep.doc_dep.set([HEAD, DEPREL], [y for x in dev_preds_dep for y in x])
        CoNLL.dict2conll(dev_batch_dep.doc_dep.to_dict(), system_pred_file)
        _, _, las_dev, uas_dev = score_dep.score(system_pred_file, gold_file)

        for batch in dev_batch_pos:
            dev_preds_upos += trainer.predict_pos(batch)
        dev_preds_upos = util.unsort(dev_preds_upos, dev_batch_pos.data_orig_idx_pos)
        accuracy_pos_dev = score_pos.score_acc(dev_preds_upos, dev_batch_pos.upos)

        for batch in dev_batch_ner:
            dev_preds_ner += trainer.predict_ner(batch)
        p, r, f1 = score_ner.score_by_entity(dev_preds_ner, dev_gold_tags)
        for k in range(len(dev_batch_ner)):
            assert len(dev_preds_ner[k]) == len(dev_gold_tags[k])

        # Average losses per batch over the epoch (normalized by the POS loader
        # length, matching the original reporting convention).
        train_loss = train_loss / len(train_batch_pos)
        train_loss_dep = train_loss_dep / len(train_batch_pos)
        train_loss_pos = train_loss_pos / len(train_batch_pos)
        train_loss_ner = train_loss_ner / len(train_batch_pos)
        print(
            "step {}: train_loss = {:.6f}, train_loss_dep = {:.6f}, train_loss_pos = {:.6f}, train_loss_ner = {:.6f}, dev_las_score = {:.4f}, dev_uas_score = {:.4f}, dev_pos = {:.4f}, dev_ner_p = {:.4f}, dev_ner_r = {:.4f}, dev_ner_f1 = {:.4f} "
            .format(
                global_step,
                train_loss,
                train_loss_dep,
                train_loss_pos,
                train_loss_ner,
                las_dev,
                uas_dev,
                accuracy_pos_dev,
                p,
                r,
                f1,
            ))

        # Save best model (ties count as improvement).
        if las_dev + accuracy_pos_dev + f1 >= (las_score_history + upos_score_history + f1_score_history):
            las_score_history = las_dev
            upos_score_history = accuracy_pos_dev
            uas_score_history = uas_dev
            f1_score_history = f1
            trainer.save(model_file)
            print("new best model saved.")

        # Reset loss accumulators and reshuffle all loaders for the next epoch.
        train_loss = 0
        train_loss_pos = 0
        train_loss_dep = 0
        train_loss_ner = 0
        print("")
        train_batch_dep.reshuffle()
        train_batch_pos.reshuffle()
        train_batch_ner.reshuffle()

    print("Training ended with {} epochs.".format(epoch))
    best_las, uas, upos, f1 = (
        las_score_history * 100,
        uas_score_history * 100,
        upos_score_history * 100,
        f1_score_history * 100,
    )
    print("Best dev las = {:.2f}, uas = {:.2f}, upos = {:.2f}, f1 = {:.2f}".format(best_las, uas, upos, f1))
def annotate(self, text=None, input_file=None, output_file=None, batch_size=1, output_type=''):
    """Annotate whitespace-tokenized text with POS, NER and dependency parses.

    Exactly one of ``text`` (a single pre-tokenized sentence string) or
    ``input_file`` (one sentence per line) should be given. When ``text`` is
    given, returns ``(data, pos_preds, ner_preds, dep_preds)``; otherwise
    writes results to ``output_file`` — 10-column CoNLL when
    ``output_type == 'conll'``, else a compact 6-column format.

    Fixes vs. the previous revision: file handles are managed with ``with``
    (no leak on exception), the inner Viterbi loop no longer shadows the
    outer batch index, and the ``batch_size`` parameter is no longer
    clobbered inside the loop.
    """
    if text is not None:
        data = [text.split(' ')]
    else:
        data = []
        # NOTE(review): uses the platform default encoding, as before —
        # confirm inputs are compatible (presumably UTF-8).
        with open(input_file) as f:
            for line in f:
                line = line.strip()
                if len(line) != 0:
                    data.append(line.split(' '))
    print("The number of sentences: ", len(data))

    # Separate tagger (POS/NER) and parser views of the same sentences.
    data_tagger = self.process_data_tagger(batch_text=data)
    data_parser = self.process_data_parser(batch_text=data)
    data_parser = self.chunk_batches(data_parser, batch_size)
    data_tagger = self.chunk_batches(data_tagger, batch_size)

    test_preds_pos = []
    test_preds_dep = []
    test_preds_ner = []
    for b in tqdm(range(len(data_tagger))):
        tokens_phobert, first_subword, words_mask, number_of_words, orig_idx, sentlens = self.get_batch(
            b, data_tagger)
        tokens_phobert1, first_subword1, words_mask1, number_of_words1, orig_idx1, sentlens1 = self.get_batch(
            b, data_parser)
        if torch.cuda.is_available():
            tokens_phobert, first_subword, words_mask = tokens_phobert.cuda(
            ), first_subword.cuda(), words_mask.cuda()
            tokens_phobert1, first_subword1, words_mask1 = tokens_phobert1.cuda(
            ), first_subword1.cuda(), words_mask1.cuda()
        preds_dep = self.dep_forward(tokens_phobert1, first_subword1, sentlens1)
        preds_pos, logits = self.tagger_forward(tokens_phobert, first_subword, sentlens)
        n_sents = tokens_phobert.size(0)

        # Dependency: MST-decode and drop the synthetic root attachment.
        head_seqs = [
            chuliu_edmonds_one_root(adj[:l, :l])[1:]
            for adj, l in zip(preds_dep[0], sentlens1)
        ]
        deprel_seqs = [
            self.vocab['deprel'].unmap(
                [preds_dep[1][i][j + 1][h] for j, h in enumerate(hs)])
            for i, hs in enumerate(head_seqs)
        ]
        pred_tokens_dep = [[[str(head_seqs[i][j]), deprel_seqs[i][j]]
                            for j in range(sentlens1[i] - 1)]
                           for i in range(n_sents)]
        test_preds_dep += util.unsort(pred_tokens_dep, orig_idx1)

        # POS: map tag ids back to strings.
        upos_seqs = [self.vocab['upos'].unmap(sent) for sent in preds_pos[0]]
        pred_tokens_pos = [[[upos_seqs[i][j]] for j in range(sentlens[i])]
                           for i in range(n_sents)]  # , xpos_seqs[i][j], feats_seqs[i][j]
        test_preds_pos += util.unsort(pred_tokens_pos, orig_idx)

        # NER: Viterbi decode with the CRF transition matrix.
        trans = self.crit_ner._transitions.data.cpu().numpy()
        scores = logits.data.cpu().numpy()
        tag_seqs = []
        for k in range(logits.size(0)):
            tags, _ = viterbi_decode(scores[k, :sentlens[k]], trans)
            tag_seqs.append(self.vocab['ner_tag'].unmap(tags))
        test_preds_ner += util.unsort(tag_seqs, orig_idx)

    # Undo the length-based sorting applied when the data was chunked.
    test_preds_dep = util.unsort(test_preds_dep, self.data_orig_idx)
    test_preds_pos = util.unsort(test_preds_pos, self.data_orig_idx)
    test_preds_ner = util.unsort(test_preds_ner, self.data_orig_idx)

    if text is not None:
        return (data, test_preds_pos, test_preds_ner, test_preds_dep)

    with open(output_file, 'w') as out:
        for i in range(len(data)):
            for j in range(len(data[i])):
                if output_type == 'conll':
                    fields = [
                        str(j + 1), data[i][j], '_', '_',
                        test_preds_pos[i][j][0], '_',
                        test_preds_dep[i][j][0], test_preds_dep[i][j][1],
                        '_', test_preds_ner[i][j],
                    ]
                else:
                    fields = [
                        str(j + 1), data[i][j],
                        test_preds_pos[i][j][0], test_preds_ner[i][j],
                        test_preds_dep[i][j][0], test_preds_dep[i][j][1],
                    ]
                out.write('\t'.join(fields) + '\n')
            out.write('\n')