from math import sqrt

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# MyModel, Visitor_Dataset, RMSLELoss, and visualize are assumed to be
# defined elsewhere in the project.


def train(dataset_path, lr, epoch, batch_size, scaler_flag, state_name):
    train_loss_curve = []
    best = -1

    # load model
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    model = MyModel()
    model = model.to(device)
    model.train()

    # dataset and dataloader
    full_dataset = Visitor_Dataset(dataset_path, scaler_flag)
    train_dataloader = DataLoader(dataset=full_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    # loss function and optimizer
    criterion = RMSLELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # start training
    for e in range(epoch):
        train_loss = 0.0
        print(f'\nEpoch: {e+1}/{epoch}')
        print('-' * len(f'Epoch: {e+1}/{epoch}'))
        # tqdm to display a progress bar
        for inputs, labels in tqdm(train_dataloader):
            # data from the dataloader
            inputs = inputs.float().to(device)
            labels = labels.float().to(device)
            outputs = model(inputs)
            # RMSLE loss
            loss = criterion(outputs, labels)
            # weights update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # accumulate the loss
            train_loss += loss.item()

        # save the best model weights as a .pth file; update `best` so the
        # comparison actually tracks the lowest loss seen so far
        train_loss_epoch = sqrt(train_loss / len(full_dataset))
        if best == -1 or train_loss_epoch < best:
            best = train_loss_epoch
            best_epoch = e
            torch.save(model.state_dict(), f'mymodel_{state_name}.pth')
        print(f'Training Loss: {train_loss_epoch:.6f}')

        # save the loss every epoch
        train_loss_curve.append(train_loss_epoch)

    # print the best RMSLE
    print(f'Final Training RMSLE Loss = {best:.6f}')
    visualize(value=train_loss_curve,
              title='Train Loss Curve',
              filename=f'rmsle_{state_name}.png')
    return full_dataset
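# The RMSLELoss criterion used above is a project-specific helper that is not
# shown in this snippet. A minimal sketch of what it might look like, assuming
# it computes root mean squared logarithmic error (predictions are clamped to
# be non-negative before the log):
import torch
import torch.nn as nn


class RMSLELoss(nn.Module):
    """RMSLE = sqrt(mean((log1p(pred) - log1p(target))^2))."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, target):
        pred = torch.clamp(pred, min=0)  # guard against log of negative values
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(target)))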
import math

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

# MyModel, MLDataset, WRMSE, and visualize are assumed to be defined
# elsewhere in the project.


def train(lr=0.001, epoch=200, batch_size=64):
    train_loss_curve = []
    train_wrmse_curve = []
    valid_loss_curve = []
    valid_wrmse_curve = []
    best = float('inf')

    # load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MyModel()
    model = model.to(device)

    # dataset and dataloader
    # load data
    full_dataset = pd.read_csv('train.csv', encoding='utf-8')
    # use torch random_split to create the validation dataset; computing the
    # second length as the remainder guarantees the lengths sum to the
    # dataset size (rounding both independently can be off by one)
    train_len = int(round(len(full_dataset) * 0.8))
    lengths = [train_len, len(full_dataset) - train_len]
    train_set, valid_set = random_split(full_dataset, lengths)
    train_dataset = MLDataset(full_dataset.iloc[train_set.indices])
    valid_dataset = MLDataset(full_dataset.iloc[valid_set.indices])
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    valid_dataloader = DataLoader(dataset=valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False)

    # loss function and optimizer
    # you can change to any loss function and optimizer you want
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # start training; tqdm displays a progress bar over epochs
    for e in tqdm(range(epoch)):
        train_loss, valid_loss = 0.0, 0.0
        train_wrmse, valid_wrmse = 0.0, 0.0
        print(f'\nEpoch: {e+1}/{epoch}')
        print('-' * len(f'Epoch: {e+1}/{epoch}'))

        model.train()
        for inputs, labels in train_dataloader:
            # data from the dataloader
            inputs = inputs.float().to(device)
            labels = labels.float().to(device)
            outputs = model(inputs)
            # MSE loss and WRMSE
            loss = criterion(outputs, labels)
            wrmse = WRMSE(outputs, labels, device)
            # weights update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # accumulate the loss
            train_loss += loss.item()
            train_wrmse += wrmse

        # =================================================================== #
        # validation pass: switch to eval mode and disable gradient tracking
        model.eval()
        with torch.no_grad():
            for inputs, labels in valid_dataloader:
                # data from the dataloader
                inputs = inputs.float().to(device)
                labels = labels.float().to(device)
                outputs = model(inputs)
                # MSE loss and WRMSE
                loss = criterion(outputs, labels)
                wrmse = WRMSE(outputs, labels, device)
                # accumulate the loss
                valid_loss += loss.item()
                valid_wrmse += wrmse
        # =================================================================== #

        # save the best model weights as a .pth file; update `best` so the
        # comparison tracks the lowest training WRMSE seen so far
        train_loss_epoch = train_loss / len(train_dataset)
        train_wrmse_epoch = math.sqrt(train_wrmse / len(train_dataset))
        valid_loss_epoch = valid_loss / len(valid_dataset)
        valid_wrmse_epoch = math.sqrt(valid_wrmse / len(valid_dataset))
        if train_wrmse_epoch < best:
            best = train_wrmse_epoch
            best_wrmse = train_wrmse_epoch
            best_loss = train_loss_epoch
            best_epoch = e
            torch.save(model.state_dict(), 'mymodel.pth')
        print(f'Training loss: {train_loss_epoch:.6f}')
        print(f'Training WRMSE: {train_wrmse_epoch:.6f}')
        print(f'Valid loss: {valid_loss_epoch:.6f}')
        print(f'Valid WRMSE: {valid_wrmse_epoch:.6f}')

        # save the loss and WRMSE every epoch
        train_loss_curve.append(train_loss_epoch)
        train_wrmse_curve.append(train_wrmse_epoch)
        valid_loss_curve.append(valid_loss_epoch)
        valid_wrmse_curve.append(valid_wrmse_epoch)

    # print the best WRMSE
    print(f'\nBest Epoch = {best_epoch}')
    print(f'Best Loss = {best_loss:.4f}')
    print(f'Best WRMSE = {best_wrmse:.4f}\n')

    # generate training curves, marking the best epoch (not the last one)
    visualize(train=train_loss_curve,
              valid=valid_loss_curve,
              title='Loss Curve',
              filename='loss.png',
              best=(best_epoch, best_loss))
    visualize(train_wrmse_curve,
              valid_wrmse_curve,
              title='WRMSE Curve',
              filename='wrmse.png',
              best=(best_epoch, best_wrmse),
              wrmse=True)
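# The WRMSE helper above is not shown in these snippets. Judging from how it
# is used (summed over batches, divided by the dataset size, then
# square-rooted), it most likely returns a batch's weighted sum of squared
# errors as a plain float. A minimal sketch under that assumption; the
# uniform `weight` vector is a placeholder for whatever per-output weights
# the task actually defines:
import torch


def WRMSE(outputs, labels, device):
    # hypothetical per-output weights (assumes 2-D [batch, outputs] tensors);
    # replace with the task's real weighting
    weight = torch.ones(labels.shape[1], device=device)
    # weighted sum of squared errors for this batch, as a Python float
    return torch.sum(weight * (outputs - labels) ** 2).item()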
import math

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

# MyModel, MLDataset, WRMSE, and visualize are assumed to be defined
# elsewhere in the project.


def train(lr=0.001, epoch=600, batch_size=32):
    train_loss_curve = []
    train_wrmse_curve = []
    valid_loss_curve = []
    valid_wrmse_curve = []

    # load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MyModel()
    model = model.to(device)

    # dataset and dataloader
    # use torch random_split to create the validation dataset
    dataset = MLDataset()
    train_size = int(0.9 * len(dataset))
    valid_size = len(dataset) - train_size
    train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    # no need to shuffle the validation data
    valid_dataloader = DataLoader(dataset=valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False)

    # loss function and optimizer
    # you can change to any loss function and optimizer you want
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best = float('inf')

    # start training
    for e in range(epoch):
        train_loss = 0.0
        train_wrmse = 0.0
        valid_loss = 0.0
        valid_wrmse = 0.0
        print(f'\nEpoch: {e+1}/{epoch}')
        print('-' * len(f'Epoch: {e+1}/{epoch}'))

        model.train()
        # tqdm to display a progress bar
        for inputs, labels in tqdm(train_dataloader):
            # data from the dataloader
            inputs = inputs.float().to(device)
            labels = labels.float().to(device)
            outputs = model(inputs)
            # MSE loss and WRMSE
            loss = criterion(outputs, labels)
            wrmse = WRMSE(outputs, labels, device)
            # weights update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # accumulate the loss
            train_loss += loss.item()
            train_wrmse += wrmse

        # =================================================================== #
        # validation pass: switch to eval mode and disable gradient tracking
        model.eval()
        with torch.no_grad():
            for inputs, labels in tqdm(valid_dataloader):
                # data from the dataloader
                inputs = inputs.float().to(device)
                labels = labels.float().to(device)
                outputs = model(inputs)
                # MSE loss and WRMSE
                loss = criterion(outputs, labels)
                wrmse = WRMSE(outputs, labels, device)
                # accumulate the loss
                valid_loss += loss.item()
                valid_wrmse += wrmse
        # =================================================================== #

        # save the best model weights as a .pth file
        loss_epoch = train_loss / len(train_dataset)
        wrmse_epoch = math.sqrt(train_wrmse / len(train_dataset))
        valid_loss_epoch = valid_loss / len(valid_dataset)
        valid_wrmse_epoch = math.sqrt(valid_wrmse / len(valid_dataset))
        if valid_wrmse_epoch < best:
            best = valid_wrmse_epoch
            torch.save(model.state_dict(), 'mymodel.pth')
        print(f'Training loss: {loss_epoch:.4f}')
        print(f'Training WRMSE: {wrmse_epoch:.4f}')
        print(f'Valid loss: {valid_loss_epoch:.4f}')
        print(f'Valid WRMSE: {valid_wrmse_epoch:.4f}')

        # save the loss and WRMSE every epoch
        train_loss_curve.append(loss_epoch)
        train_wrmse_curve.append(wrmse_epoch)
        valid_loss_curve.append(valid_loss_epoch)
        valid_wrmse_curve.append(valid_wrmse_epoch)

    # generate training curves
    visualize(train_loss_curve, valid_loss_curve, 'Train Loss')
    visualize(train_wrmse_curve, valid_wrmse_curve, 'Train WRMSE')
    print('\nBest Validation WRMSE:', best)
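# visualize is another helper these snippets assume. A minimal sketch matching
# the three-argument call above (train curve, validation curve, title), built
# on matplotlib; deriving the output filename from the title is an assumption:
import matplotlib.pyplot as plt


def visualize(train_curve, valid_curve, title):
    plt.figure()
    plt.plot(train_curve, label='train')
    plt.plot(valid_curve, label='valid')
    plt.title(title)
    plt.xlabel('epoch')
    plt.ylabel('value')
    plt.legend()
    # e.g. 'Train Loss' -> 'train_loss.png'
    plt.savefig(title.lower().replace(' ', '_') + '.png')
    plt.close()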
import gc
import os
import time

import pandas as pd
import torch
from tqdm import tqdm

# MyDataset, get_dataloader, MyModel, MyTextCNNModel, MyRCNNModel, FGM, PGD,
# count_right_num, BertAdam, device, and n_gpu are assumed to be defined or
# imported elsewhere in the project.


def train(batch_size=16,
          pretrain_model_path='',
          name='',
          model_type='mlp',
          after_bert_choice='last_cls',
          dim=1024,
          lr=1e-5,
          epoch=12,
          smoothing=0.05,
          sample=False,
          open_ad='',
          dialog_name='xxx'):
    # both a pretrained model path and a run name are required
    if not pretrain_model_path or not name:
        raise ValueError('pretrain_model_path and name must be provided')

    print('\n********** model type:', model_type, '**********')
    print('batch_size:', batch_size)

    # load dataset
    train_file = '/kaggle/input/dataset/my_train.csv'
    dev_file = '/kaggle/input/dataset/my_dev.csv'
    train_num = len(pd.read_csv(train_file).values.tolist())
    val_num = len(pd.read_csv(dev_file).values.tolist())
    print('train_num: %d, dev_num: %d' % (train_num, val_num))

    # select the model
    if model_type in ['siam', 'esim', 'sbert']:
        raise NotImplementedError('siam/esim/sbert model types are not supported here')

    train_iter = MyDataset(file=train_file,
                           is_train=True,
                           sample=sample,
                           pretrain_model_path=pretrain_model_path)
    train_iter = get_dataloader(train_iter, batch_size, shuffle=True, drop_last=True)
    dev_iter = MyDataset(file=dev_file,
                         is_train=True,
                         sample=sample,
                         pretrain_model_path=pretrain_model_path)
    dev_iter = get_dataloader(dev_iter, batch_size, shuffle=False, drop_last=False)

    if model_type == 'mlp':
        model = MyModel(dim=dim,
                        pretrain_model_path=pretrain_model_path,
                        smoothing=smoothing,
                        after_bert_choice='last_cls')
    elif model_type == 'cnn':
        model = MyTextCNNModel(dim=dim,
                               pretrain_model_path=pretrain_model_path,
                               smoothing=smoothing)
    elif model_type == 'rcnn':
        model = MyRCNNModel(dim=dim,
                            pretrain_model_path=pretrain_model_path,
                            smoothing=smoothing)

    # move the model to the GPU
    model.to(device)
    model_param_num = 0

    # 3.24 multi-gpu training
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    for p in model.parameters():
        if p.requires_grad:
            model_param_num += p.nelement()
    print('param_num:%d\n' % model_param_num)

    # Adversarial training improves generalization, but clearly slows
    # training down (called in a plug-in style).
    # 3.12 changed to FGM, which is faster!
    if open_ad == 'fgm':
        fgm = FGM(model)
    elif open_ad == 'pgd':
        pgd = PGD(model)
        K = 3

    # model-store-path
    # model_path = '/kaggle/output/' + name + '.pkl'
    # by default the output model is stored under the current directory
    output_dir = 'output'
    state = {}
    time0 = time.time()
    best_loss = float('inf')
    early_stop = 0

    for e in range(epoch):
        print('*' * 100)
        print('Epoch:', e)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        # set up the optimizer; it is re-created every epoch so the halved
        # learning rate takes effect after an early-stop rollback
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=lr,
                             warmup=0.05,
                             t_total=len(train_iter))

        train_loss = 0
        train_c = 0
        train_right_num = 0
        model.train()  # set the model to training mode
        print('training..., %s, e:%d, lr:%7f' % (name, e, lr))
        for batch in tqdm(train_iter):  # each iteration yields batch_size samples
            optimizer.zero_grad()  # zero the gradients
            batch = [b.to(device) for b in batch]  # cpu -> GPU
            # normal training step: forward pass, compute the loss
            labels = batch[-1].view(-1).cpu().numpy()
            loss, bert_enc = model(batch, task='train', epoch=epoch)
            right_num = count_right_num(bert_enc, labels)

            # multi-gpu training: average the loss across GPUs
            if n_gpu > 1:
                loss = loss.mean()
            loss.backward()  # backprop to compute the parameter gradients

            if open_ad == 'fgm':
                # adversarial training
                fgm.attack()  # add an adversarial perturbation to the embeddings
                if model_type == 'multi-task':
                    loss_adv, _, _ = model(batch, task='train')
                else:
                    loss_adv, _ = model(batch, task='train')
                if n_gpu > 1:
                    loss_adv = loss_adv.mean()
                # backprop, accumulating the adversarial gradients on top of
                # the normal ones
                loss_adv.backward()
                fgm.restore()  # restore the embedding parameters
            elif open_ad == 'pgd':
                pgd.backup_grad()  # adversarial training
                for t in range(K):
                    # add an adversarial perturbation to the embeddings;
                    # back up param.data on the first attack
                    pgd.attack(is_first_attack=(t == 0))
                    if t != K - 1:
                        optimizer.zero_grad()
                    else:
                        pgd.restore_grad()
                    if model_type == 'multi-task':
                        loss_adv, _, _ = model(batch, task='train')
                    else:
                        loss_adv, _ = model(batch, task='train')
                    if n_gpu > 1:
                        loss_adv = loss_adv.mean()
                    # backprop, accumulating the adversarial gradients on top
                    # of the normal ones
                    loss_adv.backward()
                pgd.restore()  # restore the embedding parameters

            optimizer.step()  # update the parameters
            train_loss += loss.item()  # accumulate the loss
            train_c += 1
            train_right_num += right_num

        val_loss = 0
        val_c = 0
        val_right_num = 0
        model.eval()
        print('eval...')
        with torch.no_grad():  # no gradient computation during evaluation
            for batch in tqdm(dev_iter):  # each iteration yields batch_size samples
                batch = [b.to(device) for b in batch]
                labels = batch[-1].view(-1).cpu().numpy()
                # forward pass; compute the loss
                loss, bert_enc = model(batch, task='train', epoch=epoch)
                right_num = count_right_num(bert_enc, labels)
                if n_gpu > 1:
                    loss = loss.mean()
                val_c += 1
                val_loss += loss.item()
                val_right_num += right_num

        train_acc = train_right_num / train_num
        val_acc = val_right_num / val_num
        print('train_acc: %.4f, val_acc: %.4f' % (train_acc, val_acc))
        print('train_loss: %.4f, val_loss: %.4f, time: %d' %
              (train_loss / train_c, val_loss / val_c, time.time() - time0))

        if val_loss / val_c < best_loss:
            early_stop = 0
            best_loss = val_loss / val_c
            best_acc = val_acc
            # 3.24 update: avoid a pitfall when saving the model under
            # multi-GPU training (unwrap DataParallel before state_dict)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model
            state['model_state'] = model_to_save.state_dict()
            state['loss'] = val_loss / val_c
            state['acc'] = val_acc
            state['e'] = e
            state['time'] = time.time() - time0
            state['lr'] = lr
            output_model_file = os.path.join(output_dir, name + '.pkl')
            torch.save(state, output_model_file)
            # torch.save(state, model_path)
            best_epoch = e
            cost_time = time.time() - time0
            tmp_train_acc = train_acc
            best_model = model
        else:
            early_stop += 1
            if early_stop == 2:
                break
            # roll back to the best model and halve the learning rate
            model = best_model
            lr = lr * 0.5
            print('best_loss:', best_loss)

    # 3.12 add: print the final best results
    print('-' * 30)
    print('best_epoch:', best_epoch, 'best_loss:', best_loss,
          'best_acc:', best_acc, 'reach time:', cost_time, '\n')

    # model cleanup
    del model
    gc.collect()

    # write the experiment results to a log
    """
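# FGM and PGD above follow the widely used plug-in recipe for adversarial
# training on the embedding layer. Below is a minimal sketch of an FGM class
# with the attack()/restore() interface this loop expects; the epsilon value
# and the embedding parameter-name substring are assumptions to adapt to the
# actual model. PGD adds backup_grad()/restore_grad() and a multi-step attack
# in the same style.
import torch


class FGM:
    """Fast Gradient Method: perturb embeddings along the gradient, then restore."""

    def __init__(self, model, epsilon=1.0, emb_name='word_embeddings'):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name  # substring matched against parameter names
        self.backup = {}

    def attack(self):
        # add an adversarial perturbation to the embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # put the original embedding weights back after the adversarial step
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}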