import copy
import logging
import os

import torch
import torch.nn as nn
from nltk import word_tokenize
from tqdm import tqdm


def train(model, optimizer, scheduler, train_data, dev_data, batch_size,
          fp16, checkpoint, gpu, max_grad_norm, best_acc):
    loss_fn = nn.CrossEntropyLoss()
    step_cnt = 0
    best_model_weights = None
    for pointer in tqdm(range(0, len(train_data), batch_size), desc='training'):
        model.train()  # model was in eval mode in evaluate(); re-activate train mode
        optimizer.zero_grad()  # clear gradients first
        torch.cuda.empty_cache()  # release all unoccupied cached memory
        step_cnt += 1

        # assemble the batch, skipping pairs longer than 300 tokens
        sent_pairs = []
        labels = []
        for i in range(pointer, pointer + batch_size):
            if i >= len(train_data):
                break
            sents = train_data[i].get_texts()
            if len(word_tokenize(' '.join(sents))) > 300:
                continue
            sent_pairs.append(sents)
            labels.append(train_data[i].get_label())

        logits, _ = model.ff(sent_pairs, checkpoint)
        if logits is None:
            continue
        true_labels = torch.LongTensor(labels)
        if gpu:
            true_labels = true_labels.to('cuda')
        loss = loss_fn(logits, true_labels)

        # back-propagate
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # update weights
        optimizer.step()
        # update learning rate
        scheduler.step()

        # periodically evaluate on dev and snapshot the best weights so far
        if step_cnt % 2000 == 0:
            acc = evaluate(model, dev_data, checkpoint, mute=True)
            logging.info('==> step {} dev acc: {}'.format(step_cnt, acc))
            if acc > best_acc:
                best_acc = acc
                best_model_weights = copy.deepcopy(model.cpu().state_dict())
                if gpu:
                    model.to('cuda')  # move back after snapshotting on CPU
    # return the updated best accuracy as well, so the caller can carry it
    # across epochs instead of re-passing a stale value
    return best_model_weights, best_acc
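
# evaluate() is used above but not shown in this excerpt. What follows is only
# a minimal sketch of what it might look like, assuming (as train() does) that
# model.ff() returns logits for a list of sentence pairs and that examples
# expose get_texts()/get_label(); the eval batch size of 32 is an arbitrary
# choice, not taken from the original code.
def evaluate(model, data, checkpoint, mute=False):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for pointer in range(0, len(data), 32):
            batch = data[pointer:pointer + 32]
            sent_pairs = [ex.get_texts() for ex in batch]
            true_labels = torch.LongTensor([ex.get_label() for ex in batch])
            logits, _ = model.ff(sent_pairs, checkpoint)
            if logits is None:
                continue
            preds = logits.argmax(dim=-1).cpu()
            correct += (preds == true_labels).sum().item()
            total += len(batch)
            if not mute:
                logging.info('evaluated {}/{} examples'.format(total, len(data)))
    return correct / max(total, 1)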
"Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level='O1') best_acc = -1. best_model_dic = None for ep in range(epoch_num): logging.info('\n=====epoch {}/{}====='.format(ep, epoch_num)) model_dic = train(model, optimizer, scheduler, train_data, dev_data, batch_size, fp16, checkpoint, gpu, max_grad_norm, best_acc) if model_dic is not None: best_model_dic = model_dic assert best_model_dic is not None # for testing load the best model model.load_model(best_model_dic) logging.info('\n=====Training finished. Now start test=====') test_data = nli_reader.get_examples('dev.gz') #,max_examples=50) logging.info('test data size: {}'.format(len(test_data))) test_acc = evaluate(model, test_data, batch_size) logging.info('accuracy on test set: {}'.format(test_acc)) if model_save_path is not None: os.makedirs(model_save_path, exist_ok=True) if os.listdir(model_save_path): raise ValueError( "Output directory ({}) already exists and is not empty.". format(model_save_path)) model.save(model_save_path, best_model_dic, test_acc)

# for testing, load the best model
model.load_model(best_model_dic)
logging.info('\n=====Training finished. Now start test=====')

if hans:
    nli_reader = NLIDataReader('datasets/Hans')
    hans_test_data = nli_reader.get_hans_examples('heuristics_evaluation_set.txt')
else:
    hans_test_data = []
nli_reader = NLIDataReader('datasets/SUFE')
msnli_test_data = nli_reader.get_examples('dev.gz')  # ,max_examples=50)
test_data = msnli_test_data + hans_test_data
logging.info('test data size: {}'.format(len(test_data)))
test_acc = evaluate(model, test_data, checkpoint, mute=True)
logging.info('accuracy on test set: {}'.format(test_acc))

if model_save_path is not None:
    os.makedirs(model_save_path, exist_ok=True)
    if os.listdir(model_save_path):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                model_save_path))
    model.save(model_save_path, best_model_dic, test_acc)
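
# HANS is a two-way task (entailment vs. non-entailment), whereas MNLI-style
# models predict three classes, so the HANS evaluation above only works if
# predictions are collapsed somewhere (presumably inside evaluate() or in the
# labels produced by get_hans_examples()). A sketch of the standard mapping,
# with a hypothetical entailment index of 0 -- neutral and contradiction both
# count as non-entailment:
def to_hans_label(pred_idx, entailment_idx=0):
    return 'entailment' if pred_idx == entailment_idx else 'non-entailment'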