def train_alphaBert_stage1(TS_model, dloader, testloader,
                           lr=1e-4, epoch=10, log_interval=20,
                           cloze_fix=True, use_amp=False,
                           lkahead=False, parallel=True):
    global checkpoint_file
    TS_model.to(device)

    # model_optimizer = optim.Adam(TS_model.parameters(), lr=lr)
    # if lkahead:
    #     model_optimizer = lookahead_pytorch.Lookahead(model_optimizer,
    #                                                   la_steps=5, la_alpha=0.5)
    model_optimizer = Ranger(TS_model.parameters(), lr=lr)
    if use_amp:
        TS_model, model_optimizer = amp.initialize(TS_model, model_optimizer,
                                                   opt_level="O1")
    if parallel:
        TS_model = torch.nn.DataParallel(TS_model)
        # Distributed alternatives kept from the original, commented out:
        # torch.distributed.init_process_group(backend='nccl',
        #                                      init_method='env://host',
        #                                      world_size=0, rank=0,
        #                                      store=None, group_name='')
        # TS_model = DDP(TS_model)
        # TS_model = apex.parallel.DistributedDataParallel(TS_model)
    TS_model.train()

    # criterion = alphabert_loss.Alphabert_satge1_loss(device=device)
    criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)

    iteration = 0
    total_loss = []
    out_pred_res = []
    out_pred_test = []

    for ep in range(epoch):
        t0 = time.time()
        epoch_loss = 0
        epoch_cases = 0
        for batch_idx, sample in enumerate(dloader):
            model_optimizer.zero_grad()

            src = sample['src_token']
            trg = sample['trg']
            att_mask = sample['mask_padding']
            origin_len = sample['origin_seq_length']
            bs, max_len = src.shape

            # Cloze masking is already reflected in `trg` (-1 marks positions
            # the loss ignores); the in-loop masking call was commented out:
            # src, err_cloze = make_cloze(src, max_len, device=device,
            #                             percent=0.15, fix=cloze_fix)

            src = src.float().to(device)
            trg = trg.long().to(device)
            att_mask = att_mask.float().to(device)
            origin_len = origin_len.to(device)

            prediction_scores, = TS_model(input_ids=src, attention_mask=att_mask)

            # flatten (bs, seq_len, vocab=100) -> (bs * seq_len, 100)
            loss = criterion(prediction_scores.view(-1, 100).contiguous(),
                             trg.view(-1).contiguous())

            if use_amp:
                with amp.scale_loss(loss, model_optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            model_optimizer.step()

            with torch.no_grad():
                epoch_loss += loss.item() * bs
                epoch_cases += bs

                if iteration % log_interval == 0:
                    # `batch_size` is a module-level global in the original
                    print('Ep:{} [{} ({:.0f}%)/ ep_time:{:.0f}min] L:{:.4f}'.format(
                        ep, batch_idx * batch_size,
                        100. * batch_idx / len(dloader),
                        (time.time() - t0) * len(dloader) / (60 * (batch_idx + 1)),
                        loss.item()))

                if iteration % 400 == 0:
                    save_checkpoint(checkpoint_file, 'd2s_total.pth',
                                    TS_model, model_optimizer, parallel=parallel)
                    # Show the masked input, the model's reconstruction, and
                    # the ground truth for the first sequence in the batch.
                    a_ = tokenize_alphabets.convert_idx2str(src[0][:origin_len[0]])
                    print(a_)
                    print(' ******** ******** ******** ')
                    _, show_pred = torch.max(prediction_scores[0], dim=1)
                    err_cloze_ = trg[0] > -1
                    src[0][err_cloze_] = show_pred[err_cloze_].float()
                    b_ = tokenize_alphabets.convert_idx2str(src[0][:origin_len[0]])
                    print(b_)
                    print(' ******** ******** ******** ')
                    src[0][err_cloze_] = trg[0][err_cloze_].float()
                    c_ = tokenize_alphabets.convert_idx2str(src[0][:origin_len[0]])
                    print(c_)
                    out_pred_res.append((ep, a_, b_, c_, err_cloze_))
                    out_pd_res = pd.DataFrame(out_pred_res)
                    out_pd_res.to_csv('./result/out_pred_train.csv', sep=',')

                if iteration % 999 == 0:
                    print(' ===== Show the Test of Pretrain ===== ')
                    test_res = test_alphaBert_stage1(TS_model, testloader)
                    print(' ===== Show the Test of Pretrain ===== ')
                    out_pred_test.append((ep, *test_res))
                    out_pd_test = pd.DataFrame(out_pred_test)
                    out_pd_test.to_csv('./result/out_pred_test.csv', sep=',')
            iteration += 1

        # checkpoint and log at the end of every epoch
        save_checkpoint(checkpoint_file, 'd2s_total.pth',
                        TS_model, model_optimizer, parallel=parallel)
        print('======= epoch:%i ========' % ep)
        print('++ Ep Time: {:.1f} Secs ++'.format(time.time() - t0))
        total_loss.append(float(epoch_loss / epoch_cases))
        pd_total_loss = pd.DataFrame(total_loss)
        pd_total_loss.to_csv('./result/total_loss_pretrain.csv', sep=',')
        print(total_loss)
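# --- Hypothetical sketch (not in the original source): the commented-out
# make_cloze call above suggests a character-level cloze masker roughly like
# this. The mask token id (here 1) and the meaning of `fix` (a fixed RNG seed
# for reproducible masking) are assumptions.
def make_cloze(src, max_len, device='cpu', percent=0.15, fix=False, mask_id=1):
    if fix:
        torch.manual_seed(0)                   # assumed meaning of `fix`
    # `device` is kept for signature compatibility; the mask is built on
    # src.device so indexing stays on one device.
    err_cloze = torch.rand(src.shape, device=src.device) < percent
    src = src.clone()
    src[err_cloze] = mask_id                   # replace ~15% of tokens with [MASK]
    return src, err_cloze                      # masked input + positions to predict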
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy with label smoothing.

    NOTE: only the last two lines of forward() survived in the original
    snippet; the class head and the one-hot construction below are a
    standard reconstruction consistent with them.
    """

    def __init__(self, smoothing=0.1, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.dim = dim

    def forward(self, preds, target):
        with torch.no_grad():
            # spread `smoothing` mass evenly over the non-target classes
            one_hot_target = torch.zeros_like(preds)
            one_hot_target.fill_(self.smoothing / (preds.size(self.dim) - 1))
            one_hot_target.scatter_(self.dim, target.unsqueeze(self.dim),
                                    self.confidence)
        preds = preds.log_softmax(dim=self.dim)
        return torch.mean(torch.sum(-one_hot_target * preds, dim=self.dim))


# loss_fn = nn.CrossEntropyLoss()
loss_fn = LabelSmoothingLoss()

device = torch.device('cuda:1')

from ranger import Ranger

epochs = 20
patience = 15
# opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
# opt = Lookahead(opt)
opt = Ranger(model.parameters(), lr=3e-4)

model = model.to(device)
rolling_loss = dict(train=RollingLoss(), valid=RollingLoss())
steps = dict(train=0, valid=0)
trials = 0
best_metric = -np.inf
history = []
stop = False
vis = Visdom(server='0.0.0.0', port=9090,
             username=os.environ['VISDOM_USERNAME'],
             password=os.environ['VISDOM_PASSWORD'])
# loaders = create_loaders(batch_size=7)
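# Illustrative sanity check of the reconstructed loss (not in the original):
# logits = torch.randn(4, 10)
# labels = torch.randint(0, 10, (4,))
# print(LabelSmoothingLoss(smoothing=0.1)(logits, labels))  # scalar tensor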
def train_alphaBert(DS_model, dloader, lr=1e-4, epoch=10,
                    log_interval=20, lkahead=False):
    global checkpoint_file
    DS_model.to(device)

    # model_optimizer = optim.Adam(DS_model.parameters(), lr=lr)
    # if lkahead:
    #     model_optimizer = lookahead_pytorch.Lookahead(model_optimizer,
    #                                                   la_steps=5, la_alpha=0.5)
    model_optimizer = Ranger(DS_model.parameters(), lr=lr)
    DS_model = torch.nn.DataParallel(DS_model)
    DS_model.train()

    # criterion = nn.MSELoss().to(device)
    # criterion = alphabert_loss_v02.Alphabert_loss(device=device)
    criterion = nn.CrossEntropyLoss(ignore_index=-1).to(device)

    iteration = 0
    total_loss = []
    for ep in range(epoch):
        DS_model.train()
        t0 = time.time()
        epoch_loss = 0
        epoch_cases = 0
        for batch_idx, sample in enumerate(dloader):
            model_optimizer.zero_grad()

            src = sample['src_token'].float().to(device)
            trg = sample['trg'].long().to(device)
            att_mask = sample['mask_padding'].float().to(device)
            origin_len = sample['origin_seq_length'].to(device)
            bs = len(src)

            pred_prop, = DS_model(input_ids=src, attention_mask=att_mask,
                                  out='finehead')
            trg_view = trg.view(-1).contiguous()
            loss = criterion(pred_prop, trg_view)
            # A class-balanced variant (0.2 * loss on trg_view == 0 plus
            # 0.8 * loss on trg_view == 1) was tried and commented out.

            loss.backward()
            model_optimizer.step()

            with torch.no_grad():
                epoch_loss += loss.item() * bs
                epoch_cases += bs
                if iteration % log_interval == 0:
                    print('Ep:{} [{} ({:.0f}%)/ ep_time:{:.0f}min] L:{:.4f}'.format(
                        ep, batch_idx * batch_size,
                        100. * batch_idx / len(dloader),
                        (time.time() - t0) * len(dloader) / (60 * (batch_idx + 1)),
                        loss.item()))
                if iteration % 400 == 0:
                    # The model is always wrapped in DataParallel here, so pass
                    # parallel=True (the original referenced an undefined
                    # `parallel` name).
                    save_checkpoint(checkpoint_file, 'd2s_total.pth',
                                    DS_model, model_optimizer, parallel=True)
                    print(tokenize_alphabets.convert_idx2str(src[0][:origin_len[0]]))
            iteration += 1

        # checkpoint and log at the end of every epoch
        save_checkpoint(checkpoint_file, 'd2s_total.pth',
                        DS_model, model_optimizer, parallel=True)
        # test_alphaBert(DS_model, D2S_valloader,
        #                is_clean_up=True, ep=ep, train=True)
        print('======= epoch:%i ========' % ep)
        print('++ Ep Time: {:.1f} Secs ++'.format(time.time() - t0))
        total_loss.append(float(epoch_loss / epoch_cases))
        pd_total_loss = pd.DataFrame(total_loss)
        pd_total_loss.to_csv('./iou_pic/total_loss_finetune.csv', sep=',')
        print(total_loss)
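# --- Hypothetical sketch (save_checkpoint is not defined in these snippets):
# the `parallel` flag most plausibly tells it to unwrap nn.DataParallel before
# saving, so checkpoints load into a bare model. Names below are illustrative,
# not the original implementation.
def save_checkpoint_sketch(checkpoint_dir, filename, model, optimizer,
                           parallel=True):
    net = model.module if parallel else model   # strip the DataParallel wrapper
    torch.save({'model_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               os.path.join(checkpoint_dir, filename))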
def train(fold_idx=None):
    # model = UNet(n_classes=1, n_channels=3)
    model = DeepLabV3_plus(num_classes=1, backbone='resnet', sync_bn=True)
    model = model.to(device)  # the original never moved the model to `device`
    train_dataloader, valid_dataloader = get_trainval_dataloader()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Ranger(model.parameters(), lr=1e-3, weight_decay=0.0005)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                  T_max=10)

    best_val_score = 0
    last_improved_epoch = 0
    if fold_idx is None:
        print('start')
        model_save_path = os.path.join(
            config.dir_weight, '{}.bin'.format(config.save_model_name))
    else:
        print('start fold: {}'.format(fold_idx + 1))
        model_save_path = os.path.join(
            config.dir_weight,
            '{}_fold{}.bin'.format(config.save_model_name, fold_idx))

    for cur_epoch in range(config.num_epochs):
        start_time = int(time.time())
        model.train()
        print('epoch: ', cur_epoch + 1)
        cur_step = 0
        for batch in train_dataloader:
            batch_x = batch['image'].to(device)
            batch_y = batch['mask'].to(device)
            optimizer.zero_grad()
            mask_pred = model(batch_x)
            train_loss = criterion(mask_pred, batch_y)
            train_loss.backward()
            optimizer.step()
            cur_step += 1
            if cur_step % config.step_train_print == 0:
                train_acc = accuracy(mask_pred, batch_y)
                msg = ('the current step: {0}/{1}, train loss: {2:>5.2}, '
                       'train acc: {3:>6.2%}')
                print(msg.format(cur_step, len(train_dataloader),
                                 train_loss.item(), train_acc[0].item()))

        val_miou = eval_net_unet_miou(model, valid_dataloader, device)
        val_score = val_miou
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), model_save_path)
            improved_str = '*'
            last_improved_epoch = cur_epoch
        else:
            improved_str = ''
        end_time = int(time.time())
        # Format indices fixed: the original used {3}/{4}/{5} against five
        # arguments (indices 0-4), which raises IndexError.
        msg = 'the current epoch: {0}/{1}, val score: {2:>6.2%}, cost: {3}s {4}'
        print(msg.format(cur_epoch + 1, config.num_epochs, val_score,
                         end_time - start_time, improved_str))
        if cur_epoch - last_improved_epoch > config.num_patience_epoch:
            print("No optimization for a long time, auto-stopping...")
            break
        scheduler_cosine.step()

    del model
    gc.collect()
    return best_val_score
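# Illustrative usage (not in the original): a single run, or a k-fold driver
# assuming get_trainval_dataloader handles the fold split internally:
# score = train()
# cv_scores = [train(fold_idx=i) for i in range(5)]
# print('mean CV mIoU: {:.4f}'.format(np.mean(cv_scores)))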
def train(args):
    # get configs
    epochs = args.epoch
    dim = args.dim
    lr = args.lr
    weight_decay = args.l2
    head_num = args.head_num
    aggregate = "sum"
    device = args.device
    act = args.act
    fusion = args.fusion
    beta = args.beta

    model = AttentionalTreeEmbeddig(leaf_num, importer_size, item_size,
                                    dim, head_num,
                                    fusion_type=fusion, act=act,
                                    device=device).to(device)
    # model = torch.load("./saved_models/DATE_0.6028.pkl").to(device)

    # initialize parameters
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # optimizer & loss
    optimizer = Ranger(model.parameters(), weight_decay=weight_decay, lr=lr)
    cls_loss_func = nn.BCELoss()
    reg_loss_func = nn.MSELoss()

    # save best model
    global_best_score = 0

    # early stop settings
    stop_rounds = 3
    no_improvement = 0
    current_score = None

    for epoch in range(epochs):
        for step, (batch_feature, batch_user, batch_item,
                   batch_cls, batch_reg) in enumerate(train_loader):
            model.train()
            batch_feature = batch_feature.to(device)
            batch_user = batch_user.to(device)
            batch_item = batch_item.to(device)
            batch_cls = batch_cls.to(device).view(-1, 1)
            batch_reg = batch_reg.to(device).view(-1, 1)

            # model output
            classification_output, regression_output, hidden_vector = model(
                batch_feature, batch_user, batch_item)

            # FGM attack: perturb the hidden vector along the loss gradient
            adv_vector = fgsm_attack(model, cls_loss_func, hidden_vector,
                                     batch_cls, 0.01)
            adv_output = model.pred_from_hidden(adv_vector)

            # calculate loss
            adv_loss = beta * cls_loss_func(adv_output, batch_cls)
            cls_loss = cls_loss_func(classification_output, batch_cls)
            revenue_loss = 10 * reg_loss_func(regression_output, batch_reg)
            loss = cls_loss + revenue_loss + adv_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (step + 1) % 1000 == 0:
                print("CLS loss:%.4f, REG loss:%.4f, ADV loss:%.4f, Loss:%.4f"
                      % (cls_loss.item(), revenue_loss.item(),
                         adv_loss.item(), loss.item()))

        # evaluate
        model.eval()
        print("Validate at epoch %s" % (epoch + 1))
        y_prob, val_loss = model.eval_on_batch(valid_loader)
        best_threshold, val_score, roc = torch_threshold(y_prob, xgb_validy)
        overall_f1, auc, precisions, recalls, f1s, revenues = metrics(
            y_prob, xgb_validy, revenue_valid)
        select_best = np.mean(f1s)
        print("Over-all F1:%.4f, AUC:%.4f, F1-top:%.4f"
              % (overall_f1, auc, select_best))

        print("Evaluate at epoch %s" % (epoch + 1))
        y_prob, val_loss = model.eval_on_batch(test_loader)
        overall_f1, auc, precisions, recalls, f1s, revenues = metrics(
            y_prob, xgb_testy, revenue_test, best_thresh=best_threshold)
        print("Over-all F1:%.4f, AUC:%.4f, F1-top:%.4f"
              % (overall_f1, auc, np.mean(f1s)))

        # save best model
        if select_best > global_best_score:
            global_best_score = select_best
            torch.save(model, model_path)

        # early stopping
        if current_score is None:
            current_score = select_best
            continue
        if select_best < current_score:
            current_score = select_best
            no_improvement += 1
            if no_improvement >= stop_rounds:
                print("Early stopping...")
                break
        if select_best > current_score:
            no_improvement = 0
            current_score = None
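# --- Hypothetical sketch (fgsm_attack is not defined in this snippet): an
# FGM-style perturbation that steps the hidden vector along the sign of the
# gradient of the classification loss. The signature mirrors the call above;
# the internals are an assumption.
def fgsm_attack(model, loss_func, hidden_vector, targets, epsilon):
    hidden = hidden_vector.detach().requires_grad_(True)
    loss = loss_func(model.pred_from_hidden(hidden), targets)
    grad, = torch.autograd.grad(loss, hidden)
    # One signed-gradient step; detach so the adversarial vector is a constant
    # and gradients from adv_loss flow only into the model parameters.
    return (hidden + epsilon * grad.sign()).detach()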
if not config.amp:
    model = nn.DataParallel(model)

if config.optim == 'adamw':
    optimizer = AdamW(model.parameters(), lr=config.learning_rate,
                      weight_decay=config.weight_decay)
elif config.optim == 'sgd':
    optimizer = SGD(model.parameters(), lr=config.learning_rate,
                    weight_decay=config.weight_decay,
                    momentum=0.9, nesterov=True)
elif config.optim == 'ranger':
    optimizer = Ranger(model.parameters(), lr=config.learning_rate,
                       weight_decay=config.weight_decay, use_gc=True)

if config.amp:
    # apex must initialize on the raw model, so DataParallel is applied
    # only after amp.initialize
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    model = nn.DataParallel(model)

if config.mode == 'multimodal':
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=config.n_epoch // 2,
                                                gamma=0.1, last_epoch=-1)
else:
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=config.n_epoch * len(train_loader))

if config.warmup:
    ...  # body truncated in the original snippet
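# --- Hypothetical sketch (the `if config.warmup:` branch above is truncated
# in the original): a common choice is a linear LR ramp over the first
# `warmup_steps` optimizer steps via LambdaLR; `warmup_steps` is an assumed
# parameter, not from the source.
def build_warmup_scheduler(optimizer, warmup_steps):
    return torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps))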
from ranger import Ranger

epochs = 100
patience = 15

# opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
# opt = Lookahead(opt)
base_lr = 3e-4
# Discriminative learning rates: earlier dense blocks get smaller LRs.
# NOTE: the original used floor division (`base_lr // 3`), which rounds
# 3e-4 // 3 down to 0.0 and would freeze those blocks; true division is
# clearly intended.
opt = Ranger(params=[
    {'params': model.head.parameters(), 'lr': base_lr},
    {'params': model.base.features.denseblock4.parameters(), 'lr': base_lr / 3},
    {'params': model.base.features.denseblock3.parameters(), 'lr': base_lr / 5},
    {'params': model.base.features.denseblock2.parameters(), 'lr': base_lr / 10},
    {'params': model.base.features.denseblock1.parameters(), 'lr': base_lr / 100},
])

model = model.to(device)
rolling_loss = dict(train=RollingLoss(), valid=RollingLoss())
steps = dict(train=0, valid=0)
trials = 0
best_metric = -np.inf
history = []
stop = False
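# --- Hypothetical sketch (the loop itself is not shown in the snippet): the
# patience/trials/best_metric/stop bookkeeping above typically drives an
# early-stopping loop like this; `validate_fn` is an assumed callable that
# returns the tracked validation metric.
def early_stopping_loop(model, validate_fn, epochs, patience):
    best_metric, trials, history = -np.inf, 0, []
    for epoch in range(epochs):
        metric = validate_fn(model)
        history.append(metric)
        if metric > best_metric:
            best_metric, trials = metric, 0   # improved: reset patience counter
        else:
            trials += 1
            if trials >= patience:            # no improvement for `patience` epochs
                break
    return best_metric, history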