epoch_loss = 0
model.train()

# Pass 1: CRF-only updates on the auxiliary loader (if one is provided).
if dataset_loader_crf:
    for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, SCRF_labels, mask_SCRF_labels, cnn_features in tqdm(
            itertools.chain.from_iterable(dataset_loader_crf), mininterval=2,
            desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch),
            leave=False, file=sys.stderr):
        f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, SCRF_labels, mask_SCRF_labels, cnn_features = packer.repack(
            f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, SCRF_labels, mask_SCRF_labels, cnn_features, test=False)
        optimizer.zero_grad()
        loss = model(f_f, f_p, b_f, b_p, w_f, cnn_features, tg_v, mask_v,
                     mask_v.long().sum(0), SCRF_labels, mask_SCRF_labels, onlycrf=True)
        epoch_loss += utils.to_scalar(loss)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
        optimizer.step()

# Pass 2: full updates (onlycrf=False) on the main loader.
for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, SCRF_labels, mask_SCRF_labels, cnn_features in tqdm(
        itertools.chain.from_iterable(dataset_loader), mininterval=2,
        desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch),
        leave=False, file=sys.stderr):
    f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, SCRF_labels, mask_SCRF_labels, cnn_features = packer.repack(
        f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, SCRF_labels, mask_SCRF_labels, cnn_features, test=False)
    optimizer.zero_grad()
    loss = model(f_f, f_p, b_f, b_p, w_f, cnn_features, tg_v, mask_v,
                 mask_v.long().sum(0), SCRF_labels, mask_SCRF_labels, onlycrf=False)
    # same update step as the CRF-only pass
    epoch_loss += utils.to_scalar(loss)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
    optimizer.step()
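# A minimal sketch of the `utils.to_scalar` helper assumed above (the repo's own
# implementation may differ): it extracts a single Python number from a one-element
# loss tensor so it can be accumulated into `epoch_loss`.
def to_scalar(var):
    # flatten to a 1-element tensor, then take the first (only) value
    return var.view(-1).data.tolist()[0]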
epoch_loss = 0
ner_model.train()
for feature, tg, mask in tqdm(
        itertools.chain.from_iterable(dataset_loader), mininterval=2,
        desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch),
        leave=False, file=sys.stdout):
    fea_v, tg_v, mask_v = packer.repack_vb(feature, tg, mask)
    ner_model.zero_grad()
    scores, hidden = ner_model.forward(fea_v)
    loss = crit.forward(scores, tg_v, mask_v)
    loss.backward()
    nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad)
    optimizer.step()
    epoch_loss += utils.to_scalar(loss)

# update lr
utils.adjust_learning_rate(optimizer, args.lr / (1 + (args.start_epoch + 1) * args.lr_decay))

# average
epoch_loss /= tot_length

# eval & save check_point
if 'f' in args.eva_matrix:
    dev_f1, dev_pre, dev_rec, dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)
    if dev_f1 > best_f1:
        patience_count = 0
        best_f1 = dev_f1
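# The `utils.adjust_learning_rate` helper used above is assumed to rewrite the
# optimizer's learning rate in place after each epoch; a minimal sketch under that
# assumption (the repo's actual helper may differ):
def adjust_learning_rate(optimizer, lr):
    # set the decayed learning rate on every parameter group
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr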
def train_epoch(self, cur_dataset, crf_no, crit_ner, optimizer, args):
    # cur_dataset = crf2train_dataloader[crf_no]
    self.ner_model.train()
    epoch_loss = 0
    num_sample = sum(map(lambda t: len(t), cur_dataset))
    train_corpus = [args.train_file[i].split("/")[-2] for i in self.crf2corpus[crf_no]]
    print("Epoch: [{:d}/{:d}]".format(args.start_epoch, args.epoch - 1))
    print("Train corpus: ", train_corpus)

    if args.idea[:2] not in ['P2', 'P3']:
        data_iter = itertools.chain.from_iterable(cur_dataset)
    else:
        data_iter = iter(cur_dataset)

    for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, corpus_mask_v, reorder in tqdm(
            data_iter, mininterval=2, desc=' - Total it %d' % num_sample,
            leave=False, file=sys.stdout):
        if args.idea[:2] not in ['P2', 'P3']:
            f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, corpus_mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, corpus_mask_v)
        else:
            if args.idea in ['P23', 'P33']:
                proba_dist, tg_v = tg_v
            f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v, corpus_mask_v, reorder = (
                f_f.cuda(), f_p.cuda(), b_f.cuda(), b_p.cuda(), w_f.cuda(), tg_v.cuda(),
                mask_v.cuda(), len_v.cuda(), corpus_mask_v.cuda(), reorder.cuda())

        self.ner_model.zero_grad()
        scores = self.ner_model(f_f, f_p, b_f, b_p, w_f, crf_no, corpus_mask_v)

        if args.idea in ['P23', 'P33']:
            loss = crit_ner(scores, [proba_dist, tg_v], mask_v, corpus_mask_v,
                            idea=args.idea, sigmoid=args.sigmoid, mask_value=args.mask_value)
        else:
            loss = crit_ner(scores, tg_v, mask_v, corpus_mask_v,
                            idea=args.idea, sigmoid=args.sigmoid, mask_value=args.mask_value)
        epoch_loss += utils.to_scalar(loss)

        if args.co_train:
            cf_p = f_p[0:-1, :].contiguous()
            cb_p = b_p[1:, :].contiguous()
            cf_y = w_f[1:, :].contiguous()
            cb_y = w_f[0:-1, :].contiguous()
            cfs, _ = self.ner_model.word_pre_train_forward(f_f, cf_p)
            loss = loss + args.lambda0 * self.crit_lm(cfs, cf_y.view(-1))
            cbs, _ = self.ner_model.word_pre_train_backward(b_f, cb_p)
            loss = loss + args.lambda0 * self.crit_lm(cbs, cb_y.view(-1))

        loss.backward()
        nn.utils.clip_grad_norm_(self.ner_model.parameters(), args.clip_grad)
        optimizer.step()

    epoch_loss = epoch_loss / num_sample
    self.sample_cnter[crf_no] += 1
    print("training loss: {:.4f}".format(epoch_loss))
    return epoch_loss
def train_a_epoch(name, data, tag_idx, is_oov, model, optimizer, seq_criterion,
                  lm_f_criterion, lm_b_criterion, att_loss, gamma):
    evaluator = Evaluator(name, [0, 1], main_label_name=cfg.POSITIVE_LABEL,
                          label2id=tag_idx, conll_eval=True)
    t = tqdm(data, total=len(data))

    if is_oov[0] == 1:
        print("Yes, UNKNOWN token is out of vocab")
    else:
        print("No, UNKNOWN token is not out of vocab")

    for SENT, X, C, POS, Y, P in t:
        batch_size = len(SENT)

        # zero the parameter gradients
        optimizer.zero_grad()
        model.zero_grad()
        model.init_state(len(X))

        x_var, c_var, pos_var, y_var, lm_X = to_variables(X=X, C=C, POS=POS, Y=Y)
        np.set_printoptions(threshold=sys.maxsize)  # print arrays untruncated; newer NumPy rejects threshold=np.nan

        if cfg.CHAR_LEVEL == "Attention":
            lm_f_out, lm_b_out, seq_out, seq_lengths, emb, char_emb = model(x_var, c_var)
            unrolled_x_var = list(chain.from_iterable(x_var))
            not_oov_seq = [-1 if is_oov[idx] else 1 for idx in unrolled_x_var]
            char_att_loss = att_loss(emb.detach(), char_emb,
                                     Variable(torch.cuda.LongTensor(not_oov_seq))) / batch_size
        else:
            lm_f_out, lm_b_out, seq_out, seq_lengths = model(x_var, c_var)

        logger.debug("lm_f_out : {0}".format(lm_f_out))
        logger.debug("lm_b_out : {0}".format(lm_b_out))
        logger.debug("seq_out : {0}".format(seq_out))
        logger.debug("tensor X variable: {0}".format(x_var))

        # remove start and stop tags
        pred = argmax(seq_out)
        logger.debug("Predicted output {0}".format(pred))

        seq_loss = seq_criterion(seq_out, Variable(torch.LongTensor(y_var)).cuda()) / batch_size

        # to limit the vocab size of the sample sentence (trick used to improve the lm model)
        # TODO: make sure that the start and end symbols of the sentence get through this filtering.
        logger.debug("Sample input {0}".format(lm_X))
        if gamma != 0:
            lm_X_f = [x1d[1:] for x1d in lm_X]
            lm_X_b = [x1d[:-1] for x1d in lm_X]
            lm_X_f = list(chain.from_iterable(lm_X_f))
            lm_X_b = list(chain.from_iterable(lm_X_b))
            lm_f_loss = lm_f_criterion(lm_f_out.squeeze(),
                                       Variable(cuda.LongTensor(lm_X_f)).squeeze()) / batch_size
            lm_b_loss = lm_b_criterion(lm_b_out.squeeze(),
                                       Variable(cuda.LongTensor(lm_X_b)).squeeze()) / batch_size
            if cfg.CHAR_LEVEL == "Attention":
                total_loss = seq_loss + Variable(cuda.FloatTensor([gamma])) * (lm_f_loss + lm_b_loss) + char_att_loss
            else:
                total_loss = seq_loss + Variable(cuda.FloatTensor([gamma])) * (lm_f_loss + lm_b_loss)
        else:
            if cfg.CHAR_LEVEL == "Attention":
                total_loss = seq_loss + char_att_loss
            else:
                total_loss = seq_loss

        desc = "total_loss: {0:.4f} = seq_loss: {1:.4f}".format(
            to_scalar(total_loss), to_scalar(seq_loss))
        if gamma != 0:
            desc += " + gamma: {0} * (lm_f_loss: {1:.4f} + lm_b_loss: {2:.4f})".format(
                gamma, to_scalar(lm_f_loss), to_scalar(lm_b_loss))
        if cfg.CHAR_LEVEL == "Attention":
            desc += " + char_att_loss: {0:.4f}".format(to_scalar(char_att_loss))
        t.set_description(desc)

        preds = roll(pred, seq_lengths)
        for pred, x, y in zip(preds, X, Y):
            evaluator.append_data(to_scalar(total_loss), pred, x, y)

        total_loss.backward()
        if cfg.CLIP is not None:
            clip_grad_norm(model.parameters(), cfg.CLIP)
        optimizer.step()

    evaluator.classification_report()
    return evaluator, model
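# The `roll` helper called above is not shown; it is assumed to split the flat
# per-token prediction list back into per-sentence lists using `seq_lengths`.
# A hypothetical sketch under that assumption (not the repo's actual code):
def roll(flat_preds, seq_lengths):
    out, offset = [], 0
    for n in seq_lengths:
        out.append(flat_preds[offset:offset + n])
        offset += n
    return out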
if args.co_train:
    # shifted inputs/targets for the forward and backward language-model objectives
    cf_p = f_p[0:-1, :].contiguous()
    cb_p = b_p[1:, :].contiguous()
    cf_y = w_f[1:, :].contiguous()
    cb_y = w_f[0:-1, :].contiguous()
    cfs, _ = ner_model.word_pre_train_forward(f_f, cf_p)
    cbs, _ = ner_model.word_pre_train_backward(b_f, cb_p)
    cfs_loss = args.lambda0 * crit_lm(cfs, cf_y.view(-1))
    cbs_loss = args.lambda0 * crit_lm(cbs, cb_y.view(-1))
    lm_loss += utils.to_scalar(cfs_loss) + utils.to_scalar(cbs_loss)
    if i == 'train':
        crf_loss += utils.to_scalar(loss)
        loss = loss + cfs_loss
    else:
        loss = cfs_loss
    loss = loss + cbs_loss

epoch_loss += utils.to_scalar(loss)
loss.backward()
nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad)
optimizer.step()
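# Toy illustration of how the language-model targets above line up. The shapes are
# assumed to be (seq_len, batch) word-index tensors; the values are made up:
import torch
w_f_example = torch.tensor([[11], [12], [13], [14]])  # a 4-token sentence, batch size 1
cf_y_example = w_f_example[1:, :]    # forward-LM targets (next word):      [[12], [13], [14]]
cb_y_example = w_f_example[0:-1, :]  # backward-LM targets (preceding word): [[11], [12], [13]]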
def train(self, data, *args, **kwargs):
    tot_length = sum(map(lambda t: len(t), self.dataset_loader))

    loss_list = []
    acc_list = []
    best_f1 = [float('-inf')] * self.file_num
    best_pre = [float('-inf')] * self.file_num
    best_rec = [float('-inf')] * self.file_num

    start_time = time.time()
    epoch_list = range(self.args.start_epoch, self.args.start_epoch + self.args.epoch)
    patience_count = 0

    for epoch_idx, self.args.start_epoch in enumerate(epoch_list):
        sample_num = 1
        epoch_loss = 0
        self.ner_model.train()

        for sample_id in tqdm(range(sample_num), mininterval=2,
                              desc=' - Tot it %d (epoch %d)' % (tot_length, self.args.start_epoch),
                              leave=False, file=sys.stdout):
            self.file_no = random.randint(0, self.file_num - 1)
            cur_dataset = self.dataset_loader[self.file_no]

            for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v in itertools.chain.from_iterable(cur_dataset):
                f_f, f_p, b_f, b_p, w_f, tg_v, mask_v = self.packer.repack_vb(
                    f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v)
                self.ner_model.zero_grad()
                scores = self.ner_model(f_f, f_p, b_f, b_p, w_f, self.file_no)
                loss = self.crit_ner(scores, tg_v, mask_v)
                epoch_loss += utils.to_scalar(loss)

                if self.args.co_train:
                    cf_p = f_p[0:-1, :].contiguous()
                    cb_p = b_p[1:, :].contiguous()
                    cf_y = w_f[1:, :].contiguous()
                    cb_y = w_f[0:-1, :].contiguous()
                    cfs, _ = self.ner_model.word_pre_train_forward(f_f, cf_p)
                    loss = loss + self.args.lambda0 * self.crit_lm(cfs, cf_y.view(-1))
                    cbs, _ = self.ner_model.word_pre_train_backward(b_f, cb_p)
                    loss = loss + self.args.lambda0 * self.crit_lm(cbs, cb_y.view(-1))

                loss.backward()
                nn.utils.clip_grad_norm_(self.ner_model.parameters(), self.args.clip_grad)
                self.optimizer.step()

        epoch_loss /= tot_length

        # update lr
        utils.adjust_learning_rate(
            self.optimizer,
            self.args.lr / (1 + (self.args.start_epoch + 1) * self.args.lr_decay))

        # eval & save check_point
        if 'f' in self.args.eva_matrix:
            dev_f1, dev_pre, dev_rec, dev_acc = self.evaluate(
                None, None, self.dev_dataset_loader[self.file_no], self.file_no)
            loss_list.append(epoch_loss)
            acc_list.append(dev_acc)

            if dev_f1 > best_f1[self.file_no]:
                patience_count = 0
                best_f1[self.file_no] = dev_f1
                best_pre[self.file_no] = dev_pre
                best_rec[self.file_no] = dev_rec
                self.track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc})
                print('(loss: %.4f, epoch: %d, dataset: %d, dev F1 = %.4f, dev pre = %.4f, dev rec = %.4f)'
                      % (epoch_loss, self.args.start_epoch, self.file_no, dev_f1, dev_pre, dev_rec))
                try:
                    self.save_model(None)
                except Exception as inst:
                    print(inst)
            else:
                patience_count += 1
                print('(loss: %.4f, epoch: %d, dataset: %d, dev F1 = %.4f, dev pre = %.4f, dev rec = %.4f)'
                      % (epoch_loss, self.args.start_epoch, self.file_no, dev_f1, dev_pre, dev_rec))
                self.track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc})

        print('epoch: ' + str(self.args.start_epoch) + '\t in ' + str(self.args.epoch) + ' take: '
              + str(time.time() - start_time) + ' s')

        if patience_count >= self.args.patience and self.args.start_epoch >= self.args.least_iters:
            break

    return loss_list, acc_list
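# `self.save_model(None)` above is repo-specific. A hypothetical sketch of what such a
# checkpointing helper might do (function name, arguments, and file path are assumptions,
# not the repo's actual API):
import torch

def save_checkpoint(ner_model, optimizer, path='checkpoint.pth'):
    # persist model weights and optimizer state so training can be resumed
    torch.save({'state_dict': ner_model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)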