def __init__(self, args, batch_size=None, device=None, early_stop=None,
             learning_rate=None):
    super().__init__()

    if args is not None:
        # values passed via args take precedence over keyword arguments
        if args.batch_size is not None:
            batch_size = args.batch_size
        if args.device is not None:
            device = args.device
        if args.early_stop is not None:
            early_stop = args.early_stop
        if args.learning_rate is not None:
            learning_rate = args.learning_rate

    # fall back to module-level defaults for anything still unset
    if batch_size is None:
        batch_size = batch_size_default
    if device is None:
        device = device_default
    if early_stop is None:
        early_stop = early_stop_default
    if learning_rate is None:
        learning_rate = learning_rate_default

    self.batch_size = batch_size
    self.device = get_device(device, gpuid_default)
    self.early_stop = early_stop
    self.learning_rate = learning_rate

    self.clip_norm = clip_norm_default
    self.log_per_batch = log_per_batch_default
    self.n_chars = n_chars_default
    self.char_dim = char_dim_default
    self.log_name = log_name_default
    self.train_url = train_url_default
    self.dev_url = dev_url_default
    self.test_url = test_url_default
    self.n_epochs = n_epochs_default
    self.models_path = models_path_default

    # log the parameters
    self.logger = get_logger(self.log_name)
    self.logger.info("batch_size is: %d", self.batch_size)
    self.logger.info("device is: %s", self.device.type)
    self.logger.info("early_stop is: %d", self.early_stop)
    self.logger.info("learning_rate is: %.5f", self.learning_rate)
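# A minimal usage sketch for the constructor above. The class name `Config`,
# the argparse flag names, and the module-level `*_default` constants are
# assumptions for illustration only; substitute the names actually used in
# this repo. Values found on `args` take precedence over keyword arguments,
# which in turn take precedence over the `*_default` fallbacks.
#
#     import argparse
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--batch_size", type=int, default=None)
#     parser.add_argument("--device", type=str, default=None)
#     parser.add_argument("--early_stop", type=int, default=None)
#     parser.add_argument("--learning_rate", type=float, default=None)
#     args = parser.parse_args()
#
#     # e.g. `--batch_size 32` overrides batch_size=64 below; unset flags
#     # fall back to the keyword argument, then to the *_default constant
#     config = Config(args, batch_size=64)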
def train(n_epochs=30,
          embedding_url=None,
          char_feat_dim=50,
          freeze=False,
          train_url=TRAIN_URL,
          dev_url=DEV_URL,
          test_url=None,
          max_region=10,
          learning_rate=0.001,
          batch_size=100,
          early_stop=5,
          clip_norm=5,
          device='auto',
          save_only_best=True):
    """ Train the deep exhaustive model (Sohrab et al., 2018, EMNLP)

    Args:
        n_epochs: number of epochs
        embedding_url: url to pretrained embedding file, set to None to use random embeddings
        char_feat_dim: size of character-level feature
        freeze: whether to freeze the embedding
        train_url: url to train data
        dev_url: url to dev data
        test_url: url to test data; set to None to skip test evaluation
        max_region: max entity region size
        learning_rate: learning rate
        batch_size: batch size
        early_stop: number of epochs without improvement before stopping early
        clip_norm: max norm for gradient clipping, set to 0 to disable
        device: device for torch
        save_only_best: only save the model with the best performance

    """
    # print arguments
    arguments = json.dumps(vars(), indent=2)
    print("exhaustive model is training with arguments", arguments)
    device = get_device(device)

    train_set = ExhaustiveDataset(train_url, device=device, max_region=max_region)
    train_loader = DataLoader(train_set, batch_size=batch_size, drop_last=False,
                              collate_fn=train_set.collate_func)

    vocab = ju.load(VOCAB_URL)
    n_words = len(vocab)
    char_vocab = ju.load(VOCAB_URL.replace('vocab', 'char_vocab'))
    n_chars = len(char_vocab)

    model = ExhaustiveModel(
        hidden_size=200,
        n_tags=train_set.n_tags + 1,
        char_feat_dim=char_feat_dim,
        embedding_url=embedding_url,
        bidirectional=True,
        max_region=max_region,
        n_embeddings=n_words,
        n_chars=n_chars,
        embedding_dim=200,
        freeze=freeze
    )

    if device.type == 'cuda':
        print("using gpu,", torch.cuda.device_count(), "gpu(s) available!\n")
        # model = nn.DataParallel(model)
    else:
        print("using cpu\n")
    model = model.to(device)

    criterion = F.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    max_f1, max_f1_epoch, cnt = 0, 0, 0
    # ignore the padding part when calculating the loss
    tag_weights = torch.Tensor([1] * train_set.n_tags + [0]).to(device)
    best_model_url = None

    # train and evaluate the model
    for epoch in range(n_epochs):
        # switch to train mode
        model.train()
        batch_id = 0
        for data, labels, _ in train_loader:
            optimizer.zero_grad()
            outputs = model.forward(*data)
            # use the weight parameter to skip the padding part
            loss = criterion(outputs, labels, weight=tag_weights)
            loss.backward()
            # gradient clipping
            if clip_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()

            endl = '\n' if batch_id % LOG_PER_BATCH == 0 else '\r'
            sys.stdout.write("epoch #%d, batch #%d, loss: %.6f, %s%s" %
                             (epoch, batch_id, loss.item(), datetime.now().strftime("%X"), endl))
            sys.stdout.flush()
            batch_id += 1

        cnt += 1
        # metrics on the development set
        dev_metrics = evaluate(model, dev_url)
        if dev_metrics['f1'] > max_f1:
            max_f1 = dev_metrics['f1']
            max_f1_epoch = epoch
            if save_only_best and best_model_url:
                os.remove(best_model_url)
            best_model_url = from_project_root(
                "data/model/exhaustive_model_epoch%d_%f.pt" % (epoch, max_f1))
            torch.save(model, best_model_url)
            cnt = 0

        print("maximum f1 value: %.6f, in epoch #%d\n" % (max_f1, max_f1_epoch))
        if cnt >= early_stop > 0:
            break
        print('\n')

    if test_url and best_model_url:
        model = torch.load(best_model_url)
        print("best model url:", best_model_url)
        print("evaluating on test dataset:", test_url)
        evaluate(model, test_url)

    print(arguments)
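# The `weight=tag_weights` trick used above can be checked in isolation:
# giving the extra padding class a weight of 0 removes padded positions from
# the loss. A minimal sketch, independent of this repo (shapes are
# illustrative only):
#
#     import torch
#     import torch.nn.functional as F
#
#     n_tags = 3                                         # real tag classes
#     logits = torch.randn(5, n_tags + 1)                # +1 padding class
#     labels = torch.tensor([0, 2, 1, n_tags, n_tags])   # last two are padding
#     weights = torch.Tensor([1] * n_tags + [0])
#     loss = F.cross_entropy(logits, labels, weight=weights)
#     # positions labeled with the padding class contribute nothing to `loss`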
def train_end2end(n_epochs=80,
                  embedding_url=EMBED_URL,
                  char_feat_dim=50,
                  freeze=FREEZE_WV,
                  train_url=TRAIN_URL,
                  dev_url=DEV_URL,
                  test_url=None,
                  learning_rate=LR,
                  batch_size=BATCH_SIZE,
                  early_stop=EARLY_STOP,
                  clip_norm=MAX_GRAD_NORM,
                  bsl_model_url=None,
                  gamma=0.3,
                  device='auto',
                  save_only_best=True):
    """ Train the end-to-end model; the best trained model will be saved under 'data/model/'

    Args:
        n_epochs: number of epochs
        embedding_url: url to pre-trained embedding file, set to None to use random embeddings
        char_feat_dim: size of character-level feature
        freeze: whether to freeze the embedding
        train_url: url to train data
        dev_url: url to dev data
        test_url: url to test data; set to None to skip test evaluation
        learning_rate: learning rate
        batch_size: batch size
        early_stop: number of epochs without improvement before stopping early
        clip_norm: max norm for gradient clipping, set to 0 to disable
        bsl_model_url: url to a pre-trained binary sequence labeler
        gamma: weight of the region classification loss in the total loss
        device: device for torch
        save_only_best: only save the model with the best performance

    """
    # print arguments
    arguments = json.dumps(vars(), indent=2)
    print("arguments", arguments)
    start_time = datetime.now()

    device = get_device(device)
    train_set = End2EndDataset(train_url, device=device)
    train_loader = DataLoader(train_set, batch_size=batch_size, drop_last=False,
                              collate_fn=train_set.collate_func)

    model = End2EndModel(
        hidden_size=200,
        lstm_layers=1,
        n_tags=N_TAGS,
        char_feat_dim=char_feat_dim,
        embedding_url=embedding_url,
        bidirectional=True,
        n_embeddings=200000,
        embedding_dim=200,
        freeze=freeze
    )

    if device.type == 'cuda':
        print("using gpu,", torch.cuda.device_count(), "gpu(s) available!\n")
        # model = nn.DataParallel(model)
    else:
        print("using cpu\n")
    model = model.to(device)

    bsl_model = torch.load(bsl_model_url) if bsl_model_url else None

    criterion = F.cross_entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    cnt = 0
    max_f1, max_f1_epoch = 0, 0
    best_model_url = None

    for epoch in range(n_epochs):
        # switch to train mode
        model.train()
        batch_id = 0
        for data, sentence_labels, region_labels in train_loader:
            optimizer.zero_grad()
            pred_region_labels, pred_sentence_labels = model.forward(*data, sentence_labels)
            classification_loss = criterion(pred_region_labels, region_labels)
            bsl_loss = criterion(pred_sentence_labels, sentence_labels)
            if bsl_model_url:
                # a pre-trained sequence labeler is provided, so train the region classifier alone
                loss = classification_loss
            else:
                # train the region classifier and the binary sequence labeler jointly (multitask learning)
                loss = gamma * classification_loss + (1 - gamma) * bsl_loss
            loss.backward()
            # gradient clipping
            if clip_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()

            if batch_id % LOG_PER_BATCH == 0:
                print("epoch #%d, batch #%d, loss: %.12f, %s" %
                      (epoch, batch_id, loss.item(), datetime.now().strftime("%X")))
            batch_id += 1

        cnt += 1
        # evaluate the model on the development dataset
        precision, recall, f1 = evaluate_e2e(model, dev_url, bsl_model).values()
        if f1 > max_f1:
            max_f1, max_f1_epoch = f1, epoch
            name = 'split' if bsl_model else 'end2end'
            if save_only_best and best_model_url:
                os.remove(best_model_url)
            best_model_url = from_project_root(
                "data/model/%s_model_epoch%d_%f.pt" % (name, epoch, f1))
            torch.save(model, best_model_url)
            cnt = 0
            # if test_url:
            #     evaluate_e2e(model, test_url, bsl_model)

        print("maximum f1 value: %.6f, in epoch #%d" % (max_f1, max_f1_epoch))
        print("training time:", str(datetime.now() - start_time).split('.')[0])
        print(datetime.now().strftime("%c\n"))
        if cnt >= early_stop > 0:
            break
    if test_url and best_model_url:
        best_model = torch.load(best_model_url)
        print("best model url:", best_model_url)
        print("evaluating on test dataset:", test_url)
        evaluate_e2e(best_model, test_url, bsl_model)

    print(arguments)
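# A hedged usage sketch (not necessarily how this repo's own entry point is
# written): without `bsl_model_url` the region classifier and the binary
# sequence labeler are trained jointly with the gamma-weighted loss; passing
# `bsl_model_url` loads a pre-trained sequence labeler and trains the region
# classifier alone. The .pt path below is purely illustrative.
#
#     if __name__ == '__main__':
#         # joint (end2end) multitask training
#         train_end2end(n_epochs=80, gamma=0.3, device='auto')
#
#         # split training on top of a pre-trained binary sequence labeler
#         # train_end2end(bsl_model_url=from_project_root("data/model/bsl_model.pt"))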