def train():
    # BertAdam comes from pytorch_pretrained_bert; Config, Model, build_dataset
    # and DatasetIterater are the project's own modules.
    device = Config.device
    # Prepare the data
    train_data, dev_data = build_dataset(Config)
    train_iter = DatasetIterater(train_data, Config)
    dev_iter = DatasetIterater(dev_data, Config)

    model = Model().to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Apply weight decay to all parameters except biases and LayerNorm weights
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    # optimizer = torch.optim.Adam(model.parameters(), lr=Config.learning_rate)
    # Here we use the BertAdam optimizer instead
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=Config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * Config.num_epochs)

    model.train()
    best_loss = 100000.0
    train_loss = []
    for epoch in range(Config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, Config.num_epochs))
        for step, batch in enumerate(train_iter):
            input_ids, input_mask, start_positions, end_positions = \
                batch[0], batch[1], batch[2], batch[3]
            input_ids, input_mask, start_positions, end_positions = \
                input_ids.to(device), input_mask.to(device), \
                start_positions.to(device), end_positions.to(device)
            loss, _, _ = model(input_ids, attention_mask=input_mask,
                               start_positions=start_positions,
                               end_positions=end_positions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('epoch:{}, step:{}, loss:{}'.format(epoch, step, loss.item()))
            train_loss.append(loss.item())
            if step % 100 == 0:
                eval_loss = evaluate(model, dev_iter)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(), './save_model/' + 'best_model')
                model.train()
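The loop above calls evaluate(model, dev_iter), which is defined elsewhere in the project. A minimal sketch of what such a helper could look like, assuming it simply averages the dev-set loss with the same batch layout as the training loop (the real implementation may differ):

def evaluate(model, dev_iter):
    # Sketch only: average the loss over the dev iterator without gradient tracking
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for batch in dev_iter:
            input_ids = batch[0].to(Config.device)
            input_mask = batch[1].to(Config.device)
            start_positions = batch[2].to(Config.device)
            end_positions = batch[3].to(Config.device)
            loss, _, _ = model(input_ids, attention_mask=input_mask,
                               start_positions=start_positions,
                               end_positions=end_positions)
            total_loss += loss.item()
            n_batches += 1
    return total_loss / max(n_batches, 1)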
The same training loop rewritten for the transformers library, replacing BertAdam with AdamW and an explicit linear warmup scheduler:

def train():
    # AdamW and get_linear_schedule_with_warmup come from the transformers library.
    device = Config.device
    # Prepare the data
    train_data, dev_data = build_dataset(Config)
    train_iter = DatasetIterater(train_data, Config)
    dev_iter = DatasetIterater(dev_data, Config)

    model = Model().to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Apply weight decay to all parameters except biases and LayerNorm weights
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    # optimizer = torch.optim.Adam(model.parameters(), lr=Config.learning_rate)
    # To reproduce BertAdam's behavior, set correct_bias=False
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=Config.learning_rate,
                      correct_bias=False)
    # Warm up over the first 5% of training steps, then decay linearly
    total_steps = len(train_iter) * Config.num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.05 * total_steps),
                                                num_training_steps=total_steps)

    model.train()
    best_loss = 100000.0
    train_loss = []
    for epoch in range(Config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, Config.num_epochs))
        for step, batch in enumerate(train_iter):
            start_time = time.time()
            ids, input_ids, input_mask, start_positions, end_positions = \
                batch[0], batch[1], batch[2], batch[3], batch[4]
            input_ids, input_mask, start_positions, end_positions = \
                input_ids.to(device), input_mask.to(device), \
                start_positions.to(device), end_positions.to(device)
            loss, _, _ = model(input_ids, attention_mask=input_mask,
                               start_positions=start_positions,
                               end_positions=end_positions)
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to stabilize training
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20)
            optimizer.step()
            scheduler.step()  # step the scheduler after the optimizer
            time_str = datetime.datetime.now().isoformat()
            log_str = 'time:{}, epoch:{}, step:{}, loss:{:8f}, spend_time:{:6f}'.format(
                time_str, epoch, step, loss.item(), time.time() - start_time)
            rainbow(log_str)  # colored console logging helper from the project
            train_loss.append(loss.item())
        if epoch % 1 == 0:
            eval_loss = valid(model, dev_iter)
            if eval_loss < best_loss:
                best_loss = eval_loss
                torch.save(model.state_dict(), './save_model/' + 'best_model.bin')
            model.train()
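Once training finishes, the checkpoint saved as ./save_model/best_model.bin can be restored for prediction. A minimal sketch, assuming the same Model and Config classes used above (the loader function itself is illustrative, not part of the project):

def load_best_model(path='./save_model/best_model.bin'):
    # Sketch only: rebuild the model, load the saved weights, switch to eval mode
    model = Model()
    model.load_state_dict(torch.load(path, map_location=Config.device))
    model.to(Config.device)
    model.eval()
    return model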