def create_session(self, improve_by=5, min_epoch=10):
    """Create the TF session, variable initializer, saver and early stopping.

    When the model needs a pretrained feature extractor
    (``self.feature_extractor_needed``), also restores the VGG16 weights
    saved by ``cifar100.py`` into the ``feature_extractor`` scope.

    Args:
        improve_by: minimum improvement forwarded to ``utils.EarlyStopping``.
        min_epoch: minimum epochs before early stopping may trigger.
    """
    self.objs['saver'] = tf.train.Saver()
    # self.objs['sess'] = tf.Session(config = self.session_config)
    self.objs['sess'] = tf.InteractiveSession()
    self.objs['sess'].run(tf.global_variables_initializer())
    self.objs['es'] = utils.EarlyStopping(self.objs['sess'],
                                          self.objs['saver'],
                                          save_dir="saved_seed%d" % self.seed,
                                          improve_by=improve_by,
                                          min_epoch=min_epoch)
    if self.feature_extractor_needed:
        if not os.path.exists("vgg16_cifar100"):
            print("Pretrained model doesnt exist for VGG16")
            print("Run cifar100.py first")
            # BUGFIX: this is an error path, so exit with a non-zero status.
            # The original called exit(0), which signals success to callers
            # and shell scripts.
            exit(1)
        else:
            # Restore only the variables that belong to the feature extractor.
            reqd_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope="feature_extractor")
            feature_extractor_saver = tf.train.Saver(reqd_variables)
            print("Restoring feature extractor variables")
            feature_extractor_saver.restore(self.objs['sess'],
                                            "vgg16_cifar100/saved.ckpt")
            print("Done")
def optimize(self):
    """Run the training loop: one Adam step per epoch via the
    optimization-loop callback, with optional LR decay on plateau and
    early stopping on the training loss."""
    print(colored('starting optimization with ADAM...', 'cyan'))
    args = self.args
    self.optimizer = torch.optim.Adam(self.parameters, lr=args.lr)
    lr_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        mode='min',
        factor=args.lr_factor,
        threshold=args.lr_thresh,
        patience=args.lr_patience,
    )
    # Early stopping watches the training loss (there is no validation
    # metric); the min_delta is a percentage of the previous loss.
    early_stop = u.EarlyStopping(patience=args.earlystop_patience,
                                 min_delta=args.earlystop_min_delta,
                                 percentage=True)
    t0 = time()
    for _ in range(args.epochs):
        self.optimizer.zero_grad()
        current_loss = self.optimization_loop()
        self.optimizer.step()
        if args.reduce_lr:
            lr_plateau.step(current_loss)
        if early_stop.step(current_loss):
            break
    self.elapsed = time() - t0
    print(colored(u.sec2time(self.elapsed), 'yellow'))
def run(fold, model_name):
    """Train a RoBERTa tweet model on one CV fold with TensorBoard logging
    and jaccard-based early stopping; best weights go to
    ``{SAVE_PATH}/{model_name}-f{fold}.pt``."""
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/',
                           filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)
    # kfold column selects the held-out validation split.
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    # The head consumes hidden states of all layers.
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=num_train_steps)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        # BUGFIX: the original passed `writer` positionally *after* the
        # keyword argument `scheduler=scheduler`, which is a SyntaxError.
        # Pass it by keyword instead (assumes engine.train_fn has a
        # parameter named `writer`, matching eval_fn below — TODO confirm).
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Jaccard Score = {jaccard}")
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
def __init__(self, model: Any, model_name: str = None):
    """Wrap *model* for TPU training: pick the XLA device and build the
    optimizer, loss, early stopping and plateau LR scheduler."""
    super().__init__(model)
    self.model_name = model_name
    # One XLA (TPU) core per process; the learning rate is scaled by the
    # number of participating processes.
    self.device = xm.xla_device()
    scaled_lr = 1e-4 * xm.xrt_world_size()
    self.optimizer = transformers.AdamW(self.model.parameters(), lr=scaled_lr)
    self.criterion = nn.BCEWithLogitsLoss()
    self.early_stopping = utils.EarlyStopping(patience=5, verbose=True)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        mode="max",
        patience=5,
        factor=0.3,
        verbose=True,
    )
def run_trial(params, grid_logger=None, grid_logger_avg=None, cuda_id=0):
    """Run `params['iterations']` independent training trials.

    Each trial builds a fresh ModelRunner, trains for `params['epochs']`
    epochs with early stopping on (val loss + tempo loss), and reports
    either to NNI (when `params['is_nni']`) or to a `utils.Results` logger.
    """
    if not params["is_nni"]:
        # Grid-search bookkeeping is only used outside NNI runs.
        results_logger = utils.Results(grid_logger, grid_logger_avg, params)
    for it in range(params['iterations']):
        print("Starting Trial")
        print(params)
        dataset = params['loader']
        model = ModelRunner(params, dataset, cuda_device=cuda_id)
        model.architecture()
        early_stopping = utils.EarlyStopping(
            patience=params['early_stopping_patience'], verbose=True)
        for epoch in range(int(params['epochs'])):
            train_results = model.train(epoch)  # train
            valid_results = model.validation(epoch)  # validation
            if params["is_nni"]:
                if epoch % 1 == 0:
                    nni.report_intermediate_result(
                        train_results["f1_score_macro"])
            if not params['is_nni']:
                results_logger.insert_scores(train_results=train_results,
                                             valid_results=valid_results)
                utils.print_log_data(train_results=train_results,
                                     valid_results=valid_results,
                                     epoch=epoch)
            # Final epoch: run the test set and write the trial summary.
            if epoch == int(params['epochs']) - 1:
                test_results, best_epoch = model.test()  # test
                if params["is_nni"]:
                    nni.report_final_result(valid_results["f1_score_macro"])
                else:
                    results_logger.insert_scores(test_results=test_results)
                    utils.print_log_data(train_results=train_results,
                                         valid_results=valid_results,
                                         epoch=epoch)
                    # NOTE(review): nesting reconstructed — write_log /
                    # write_avg_log assumed to belong to the non-NNI final
                    # epoch branch; confirm against the original layout.
                    results_logger.write_log(it, best_epoch)
                    results_logger.write_avg_log()
            # Early stopping watches the combined validation loss.
            valid_loss = valid_results['loss'] + valid_results['tempo_loss']
            early_stopping(valid_loss, model.net)
            if early_stopping.early_stop:
                print(f"Early stopping, epoch:{epoch}")
                break
    print("done")
def run():
    """Train a BERT tweet model on the full (non-folded) training file.

    Trains for up to MAX_EPOCHS epochs and saves the final weights to
    ``model_full.bin``. No validation split is used here.
    """
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)
    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # The head needs the hidden states of all layers.
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)
    # NOTE(review): EPOCHS / MAX_EPOCHS are bare names here (elsewhere
    # config.EPOCHS is used) — presumably module-level constants; confirm.
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    # NOTE(review): `es` is created but never consulted in this loop —
    # without a validation set there is no metric to feed it; confirm intent.
    es = utils.EarlyStopping(patience=2, mode="max")
    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        # Stop and persist the model once MAX_EPOCHS have completed.
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
def run(train_path, dev_path, batch_size, device, epochs=50, path="weights/model.bin"):
    """Train the LSTM divider model.

    Args:
        train_path: path to the training data file.
        dev_path: path to the validation data file.
        batch_size: batch size for both loaders.
        device: torch device to train on.
        epochs: maximum number of epochs.
        path: where to save the model weights on early stop.
    """
    # Build model
    print('Building model ...')
    net = LSTM_divider(consts.voc_size)
    net.to(device)
    print('Done!')
    print('Building dataset ...')
    train_dataset = utils.Dataset(train_path, consts.CHAR2IDX)
    val_dataset = utils.Dataset(dev_path, consts.CHAR2IDX)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=utils.make_batch)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  collate_fn=utils.make_batch)
    print('Done!')
    # Class for Early Stopping
    es = utils.EarlyStopping()
    for epoch in range(1, epochs + 1):
        print(f'epoch {epoch}')
        train_fn(net, train_data_loader, device)
        precision = valid_fn(net, val_data_loader, device)
        # If score is not improved during certain term,
        # stop running
        # NOTE(review): weights are saved only on the early-stop path; if
        # all `epochs` complete without a stop, nothing is written to
        # `path` — confirm this is intended.
        if es.update(precision):
            print(f'Score has not been improved for {es.max_patient} epochs')
            print(f'Best precision -> {es.best}')
            torch.save(net.state_dict(), path)
            return
def load_model_weight_continue_train(self):
    """Rebuild the model, restore weights from ``cv_model.pth`` and resume
    training for epochs 8..12 with loss-based early stopping.

    In submit runs the dev-set evaluation (and therefore early stopping)
    is skipped entirely.
    """
    self.build_and_set_model()
    assert os.path.exists("cv_model.pth")
    self.model_ft.load_state_dict(torch.load("cv_model.pth"))
    # NOTE(review): eval() right before continued training — presumably
    # train_one_epoch switches the model back to train mode; confirm.
    self.model_ft.eval()
    patience = 3
    if self.submit_run:
        patience = 0
    es = utils.EarlyStopping(
        patience=patience
    )  # the first time it become worse, if patience set to 1
    # Resume from epoch 8 (the checkpoint's last epoch) up to 12.
    for epoch in range(8, 13):
        train_loss = self.train_one_epoch(
            self.model_ft,
            self.optimizer,
            self.data_loader,
            self.device,
            epoch,
            self.metric_logger,
            print_freq=100,
        )
        print(f"train_loss (averaged) is {train_loss}")
        self.lr_scheduler.step()  # change learning rate
    if not self.submit_run:
        metric = self.eval_model_loss(
            self.model_ft,
            self.data_loader_dev,
            self.device,
            self.metric_logger,
            print_freq=100,
        )
        print(f"metric (averaged) is {metric}")
        if es.step(metric):
            print(
                f"{epoch+1} epochs run and early stop, with patience {patience}"
            )
            break
def train_model(self):
    """Train ``self.model_ft`` for ``self.num_epochs`` epochs.

    After each epoch the LR scheduler steps; unless this is a submit run,
    the dev-set loss is evaluated and fed to early stopping.
    """
    patience = 3
    if self.submit_run:
        # Submit runs skip evaluation, so early stopping never fires anyway.
        patience = 0
    es = utils.EarlyStopping(
        patience=patience
    )  # the first time it become worse, if patience set to 1
    for epoch in range(self.num_epochs):
        train_loss = self.train_one_epoch(
            self.model_ft,
            self.optimizer,
            self.data_loader,
            self.device,
            epoch,
            self.metric_logger,
            print_freq=10,
            mq_logger=self.logger,
        )
        self.metric_logger.print_and_log_to_file(
            f"train_loss (averaged) is {train_loss}")
        self.lr_scheduler.step()  # change learning rate
        if not self.submit_run:
            metric = self.eval_model_loss(
                self.model_ft,
                self.data_loader_dev,
                self.device,
                self.metric_logger,
                print_freq=10,
            )
            self.metric_logger.print_and_log_to_file(
                f"\nmetric (averaged) is {metric}\n")
            if es.step(metric):
                self.print_log(
                    f"{epoch+1} epochs run and early stop, with patience {patience}"
                )
                break
def run(fold):
    """
    Train model for a specified fold
    """
    # Read training csv
    dfx = pd.read_csv(TRAINING_FILE)
    # Set train validation set split
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print('SIZE', len(df_train), len(df_valid))
    # Instantiate TweetDataset with training data
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )
    # Instantiate TweetDataset with validation data
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=2
    )
    # Set device as `cuda` (GPU)
    device = torch.device("cuda")
    # Load pretrained RoBERTa
    model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = TweetModel(conf=model_config)
    # Move the model to the GPU
    model.to(device)
    # Calculate the number of training steps
    num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    # Apply early stopping with patience of 2
    # This means to stop training new epochs when 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")
    # es = EarlyStopping(patience=2)
    print(f"Training is Starting for fold={fold}")
    # I'm training only for 3 epochs even though I specified 5!!!
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=SAVE_HEAD + str(fold) + '.bin')
        if es.early_stop:
            print("Early stopping")
            break
def train(config, fold, model_1, model_2, dict_loader, optimizer, scheduler,
          list_dir_save_model, dir_pyplot, Validation=True, Test_flag=True):
    """Train ``model_1`` on top of a frozen ``model_2`` feature extractor.

    Each epoch: run the training batches, optionally evaluate on the
    validation set (driving per-metric model saving / early stopping), and
    optionally evaluate on the test set. Stops early only when *all*
    EarlyStopping trackers have stopped.

    NOTE(review): indentation of this block was reconstructed from a
    whitespace-collapsed source — in particular the extent of the
    ``torch.no_grad()`` region; confirm against the original.
    """
    train_loader = dict_loader['train']
    val_loader = dict_loader['val']
    test_loader = dict_loader['test']
    """ loss """
    # criterion_cls = nn.CrossEntropyLoss()
    # criterion_cls = ut.FocalLoss(gamma=st.focal_gamma, alpha=st.focal_alpha, size_average=True)
    criterion_cls = nn.BCELoss()
    # criterion = nn.L1Loss(reduction='mean').cuda()
    criterion = nn.MSELoss(reduction='mean').cuda()
    # criterion_gdl = gdl_loss(pNorm=2).cuda()
    EMS = ut.eval_metric_storage()
    # One selected-metric store + one EarlyStopping per evaluation standard.
    list_selected_EMS = []
    list_ES = []
    for i_tmp in range(len(st.list_standard_eval_dir)):
        list_selected_EMS.append(ut.eval_selected_metirc_storage())
        list_ES.append(ut.EarlyStopping(delta=0,
                                        patience=st.early_stopping_patience,
                                        verbose=True))
    print('training')
    """ epoch """
    # model_2 acts as a fixed feature extractor.
    ut.model_freeze(model_2, requires_grad=False)
    num_data = len(train_loader.dataset)
    for epoch in range(config.num_epochs):
        epoch = epoch + 1  # increase the # of the epoch
        print(" ")
        print("--------------- epoch {} ----------------".format(epoch))
        torch.cuda.empty_cache()
        """ print learning rate """
        for param_group in optimizer.param_groups:
            print('current LR : {}'.format(param_group['lr']))
        """ batch """
        for i, data_batch in enumerate(train_loader):
            # start = time.time()
            model_1.train()
            model_2.eval()
            EMS.total_train_step += 1
            with torch.no_grad():
                """ input"""
                datas = Variable(data_batch['data'].float()).cuda()
                # labels = Variable(data_batch['label'].long()).cuda()
                labels = Variable(data_batch['label'].float()).cuda()
                """ minmax norm"""
                if st.list_data_norm_type[st.data_norm_type_num] == 'minmax':
                    tmp_datas = datas.view(datas.size(0), -1)
                    tmp_datas -= tmp_datas.min(1, keepdim=True)[0]
                    tmp_datas /= tmp_datas.max(1, keepdim=True)[0]
                    datas = tmp_datas.view_as(datas)
                """ data augmentation """
                ##TODO : flip
                # flip_flag_list = np.random.normal(size=datas.shape[0])>0
                # datas[flip_flag_list] = datas[flip_flag_list].flip(-3)
                ##TODO : translation, cropping
                dict_result = ut.data_augmentation(datas=datas, cur_epoch=epoch)
                datas = dict_result['datas']
                # aug_dict_result = ut.data_augmentation(datas=aug_datas, cur_epoch=epoch)
                # aug_datas = aug_dict_result['datas']
                """ gaussain noise """
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.tensor([0.01]))
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.FloatTensor(1).uniform_(0, 0.01))
                # Gaussian_noise = Gaussian_dist.sample(datas.size()).squeeze(-1)
                # datas = datas + Gaussian_noise.cuda()
                """ model 1 forward """
                # Frozen feature extractor: no gradients needed here.
                dict_result = model_2(datas)
                output_3 = dict_result['logitMap']
            """ forward propagation """
            # model_1 is trained, so its forward runs with grad enabled.
            dict_result = model_1(output_3.detach())
            output_1 = dict_result['logits']
            output_2 = dict_result['Aux_logits']
            output_3 = dict_result['logitMap']
            """ classification """
            loss_list_1 = []
            loss_2 = criterion_cls(output_1, labels)
            loss_list_1.append(loss_2)
            EMS.train_aux_loss_1.append(loss_2.data.cpu().numpy())
            loss = sum(loss_list_1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            """ print the train loss and tensorboard"""
            if (EMS.total_train_step) % 10 == 0:
                # print('time : ', time.time() - start)
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                      % (epoch, config.num_epochs, i + 1,
                         (round(num_data / config.batch_size)),
                         loss.data.cpu().numpy()))
                torch.cuda.empty_cache()
            """ pyplot """
            EMS.train_loss.append(loss.data.cpu().numpy())
            EMS.train_step.append(EMS.total_train_step)
        """ val """
        if Validation == True:
            print("------------------ val --------------------------")
            dict_result = ut.eval_classification_model_2(
                config, fold, val_loader, model_1, model_2, criterion_cls)
            val_loss = dict_result['Loss']
            acc = dict_result['Acc']
            auc = dict_result['AUC']
            print('Fold : %d, Epoch [%d/%d] val Loss = %f val Acc = %f'
                  % (fold, epoch, config.num_epochs, val_loss, acc))
            torch.cuda.empty_cache()
            """ save the metric """
            EMS.dict_val_metric['val_loss'].append(val_loss)
            EMS.dict_val_metric['val_acc'].append(acc)
            EMS.dict_val_metric['val_auc'].append(auc)
            EMS.val_step.append(EMS.total_train_step)
            """ save model """
            # One save decision per evaluation standard.
            for i_tmp in range(len(list_selected_EMS)):
                save_flag = ut.model_save_through_validation(
                    fold, epoch,
                    EMS=EMS,
                    selected_EMS=list_selected_EMS[i_tmp],
                    ES=list_ES[i_tmp],
                    model=model_1,
                    dir_save_model=list_dir_save_model[i_tmp],
                    metric_1=st.list_standard_eval[i_tmp],
                    metric_2='',
                    save_flag=False)
        if Test_flag == True:
            print("------------------ test _ test dataset --------------------------")
            """ load data """
            dict_result = ut.eval_classification_model_2(
                config, fold, test_loader, model_1, model_2, criterion_cls)
            test_loss = dict_result['Loss']
            acc = dict_result['Acc']
            test_loss = dict_result['Loss']
            """ pyplot """
            EMS.test_acc.append(acc)
            EMS.test_loss.append(test_loss)
            EMS.test_step.append(EMS.total_train_step)
            print('number of test samples : {}'.format(len(test_loader.dataset)))
            print('Fold : %d, Epoch [%d/%d] test Loss = %f test Acc = %f'
                  % (fold, epoch, config.num_epochs, test_loss, acc))
            torch.cuda.empty_cache()
        """ learning rate decay"""
        EMS.LR.append(optimizer.param_groups[0]['lr'])
        scheduler.step()
        # scheduler.step(val_loss)
        """ plot the chat """
        if epoch % 10 == 0:
            ut.plot_training_info_1(fold, dir_pyplot, EMS,
                                    flag='percentile', flag_match=False)
        ##TODO : early stop only if all of metric has been stopped
        tmp_count = 0
        for i in range(len(list_ES)):
            if list_ES[i].early_stop == True:
                tmp_count += 1
        if tmp_count == len(list_ES):
            break
    """ release the model """
    del model_1, EMS
    torch.cuda.empty_cache()
def train(model, train_data, val_data, args):
    """Train with step-based validation and early stopping.

    Runs epochs until either early stopping triggers (after
    ``args.min_iterations``) or ``args.max_iterations`` steps complete.
    Validation is evaluated every ``args.val_evaluation_freq`` epochs'
    worth of steps.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.BCEWithLogitsLoss()
    num_iterations_per_epoch = len(train_data) / args.batch_size
    val_eval_freq = int(args.val_evaluation_freq * num_iterations_per_epoch)
    print(
        f"Val set evaluated every {val_eval_freq:,} steps (approx. {args.val_evaluation_freq} epoch)"
    )
    es = utils.EarlyStopping(args.early_stopping_patience)
    initial_time = time.time()
    train_dataloader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    global_step = 0
    epoch_no = 0
    while True:
        print(f"EPOCH #{epoch_no+1}")
        # Train single epoch
        for batch in train_dataloader:
            headlines, headline_lengths, bodys, para_lengths, labels = tuple(
                b.to(device) for b in batch)
            optimizer.zero_grad()
            preds = model(headlines, headline_lengths, bodys, para_lengths)
            loss = criterion(preds, labels)
            loss.backward()
            # Clip gradients to unit norm to stabilise training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            global_step += 1
            print("globstep:", global_step)
            if global_step % val_eval_freq == 0:
                # Evaluate on validation set
                val_loss, val_acc, val_auc = evaluate(model, val_data)
                model.train()  # evaluate() leaves the model in eval mode
                end_time = time.time()
                minutes_elapsed = int((end_time - initial_time) / 60)
                print(
                    "STEP: {:7} | TIME: {:4}min | VAL LOSS: {:.4f} | VAL ACC: {:.4f} | VAL AUROC: {:.4f}"
                    .format(global_step, minutes_elapsed, val_loss, val_acc,
                            val_auc))
                # Check early stopping
                if global_step >= args.min_iterations:
                    es.record_loss(val_loss, model)
                    if es.should_stop():
                        print(f"Early stopping at STEP: {global_step}...")
                        return
            # NOTE(review): placed at batch level so the cap is reached even
            # when max_iterations is not a multiple of val_eval_freq —
            # confirm against the original indentation.
            if global_step == args.max_iterations:
                print(
                    f"Stopping after reaching max iterations({global_step})..."
                )
                return
        epoch_no += 1
def train(config, fold, model, dict_loader, optimizer, scheduler,
          list_dir_save_model, dir_pyplot, Validation=True, Test_flag=True):
    """Train ``model`` with gradient accumulation (every ``st.iter_to_update``
    iterations) and up to three loss terms gated by ``fst`` flags:
    image-level BCE, ROI/aux BCE, and an L1 sparsity penalty.

    Each epoch optionally evaluates on validation (driving per-metric model
    saving / early stopping) and on the test set; stops early only when all
    EarlyStopping trackers have stopped.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source; in particular the gradient-accumulation bookkeeping block and
    the live `loss_list_1 = []` reset (which appears commented out in the
    collapsed text but is required per batch) should be confirmed.
    """
    train_loader = dict_loader['train']
    val_loader = dict_loader['val']
    test_loader = dict_loader['test']
    """ loss """
    # criterion_cls = nn.CrossEntropyLoss()
    # criterion_cls = ut.FocalLoss(gamma=st.focal_gamma, alpha=st.focal_alpha, size_average=True)
    # kdloss = ut.KDLoss(4.0)
    criterion_KL = nn.KLDivLoss(reduction="sum")
    criterion_cls = nn.BCELoss()
    # criterion_L1 = nn.L1Loss(reduction='sum').cuda()
    # criterion_L2 = nn.MSELoss(reduction='mean').cuda()
    # criterion_gdl = gdl_loss(pNorm=2).cuda()
    EMS = ut.eval_metric_storage()
    list_selected_EMS = []
    list_ES = []
    for i_tmp in range(len(st.list_standard_eval_dir)):
        list_selected_EMS.append(ut.eval_selected_metirc_storage())
        list_ES.append(
            ut.EarlyStopping(delta=0,
                             patience=st.early_stopping_patience,
                             verbose=True))
    # Per-loss-term accumulators across the accumulation window.
    loss_tmp = [0] * 5
    loss_tmp_total = 0
    print('training')
    # Prime the optimizer so the first scheduler.step() is well-defined.
    optimizer.zero_grad()
    optimizer.step()
    """ epoch """
    num_data = len(train_loader.dataset)
    for epoch in range(1, config.num_epochs + 1):
        scheduler.step()
        print(" ")
        print("--------------- epoch {} ----------------".format(epoch))
        """ print learning rate """
        for param_group in optimizer.param_groups:
            print('current LR : {}'.format(param_group['lr']))
        """ batch """
        for i, data_batch in enumerate(train_loader):
            # start = time.time()
            model.train()
            with torch.no_grad():
                """ input"""
                datas = Variable(data_batch['data'].float()).cuda()
                # labels = Variable(data_batch['label'].long()).cuda()
                labels = Variable(data_batch['label'].float()).cuda()
                """ data augmentation """
                ##TODO : flip
                # flip_flag_list = np.random.normal(size=datas.shape[0])>0
                # datas[flip_flag_list] = datas[flip_flag_list].flip(-3)
                ##TODO : translation, cropping
                dict_result = ut.data_augmentation(datas=datas, cur_epoch=epoch)
                datas = dict_result['datas']
                translation_list = dict_result['translation_list']
                # aug_dict_result = ut.data_augmentation(datas=aug_datas, cur_epoch=epoch)
                # aug_datas = aug_dict_result['datas']
                """ minmax norm"""
                if st.list_data_norm_type[st.data_norm_type_num] == 'minmax':
                    tmp_datas = datas.view(datas.size(0), -1)
                    tmp_datas -= tmp_datas.min(1, keepdim=True)[0]
                    tmp_datas /= tmp_datas.max(1, keepdim=True)[0]
                    datas = tmp_datas.view_as(datas)
                """ gaussain noise """
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.tensor([0.01]))
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.FloatTensor(1).uniform_(0, 0.01))
                # Gaussian_noise = Gaussian_dist.sample(datas.size()).squeeze(-1)
                # datas = datas + Gaussian_noise.cuda()
            """ forward propagation """
            dict_result = model(datas, translation_list)
            output_1 = dict_result['logits']
            output_2 = dict_result['Aux_logits']
            output_3 = dict_result['logitMap']
            output_4 = dict_result['l1_norm']
            loss_list_1 = []
            count_loss = 0
            # Loss term 1: image-level classification with label smoothing.
            if fst.flag_loss_1 == True:
                s_labels = ut.smooth_one_hot(labels,
                                             config.num_classes,
                                             smoothing=st.smoothing_img)
                loss_2 = criterion_cls(
                    output_1, s_labels) * st.lambda_major[0] / st.iter_to_update
                loss_list_1.append(loss_2)
                loss_tmp[count_loss] += loss_2.data.cpu().numpy()
                if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                    EMS.train_aux_loss[count_loss].append(loss_tmp[count_loss])
                    loss_tmp[count_loss] = 0
                count_loss += 1
            # Loss term 2: one auxiliary (ROI) head per output_2 entry.
            if fst.flag_loss_2 == True:
                for i_tmp in range(len(output_2)):
                    s_labels = ut.smooth_one_hot(labels,
                                                 config.num_classes,
                                                 smoothing=st.smoothing_roi)
                    loss_2 = criterion_cls(
                        output_2[i_tmp],
                        s_labels) * st.lambda_aux[i_tmp] / st.iter_to_update
                    loss_list_1.append(loss_2)
                    loss_tmp[count_loss] += loss_2.data.cpu().numpy()
                    if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                        EMS.train_aux_loss[count_loss].append(
                            loss_tmp[count_loss])
                        loss_tmp[count_loss] = 0
                    count_loss += 1
            # Loss term 3: normalised L1 sparsity over patch/ROI logit maps.
            if fst.flag_loss_3 == True:  # patch
                list_loss_tmp = []
                for tmp_j in range(len(output_4)):  # type i.e., patch, roi
                    loss_2 = 0
                    for tmp_i in range(len(output_4[tmp_j])):  # batch
                        tmp_shape = output_4[tmp_j][tmp_i].shape
                        logits = output_4[tmp_j][tmp_i].view(
                            tmp_shape[0], tmp_shape[1], -1)
                        # loss_2 += torch.norm(logits, p=1)
                        loss_2 += torch.norm(logits, p=1) / (logits.view(-1).size(0))
                    list_loss_tmp.append(
                        (loss_2 / len(output_4[tmp_j]) * st.l1_reg_norm) /
                        st.iter_to_update)
                loss_list_1.append(sum(list_loss_tmp))
                loss_tmp[count_loss] += sum(list_loss_tmp).data.cpu().numpy()
                if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                    EMS.train_aux_loss[count_loss].append(loss_tmp[count_loss])
                    loss_tmp[count_loss] = 0
                count_loss += 1
            """ L1 reg"""
            # norm = torch.FloatTensor([0]).cuda()
            # for parameter in model.parameters():
            #     norm += torch.norm(parameter, p=1)
            # loss_list_1.append(norm * st.l1_reg)
            loss = sum(loss_list_1)
            # Gradients accumulate across iter_to_update backward passes.
            loss.backward()
            torch.cuda.empty_cache()
            loss_tmp_total += loss.data.cpu().numpy()
            #TODO : optimize the model param
            if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                optimizer.step()
                optimizer.zero_grad()
                """ pyplot """
                EMS.total_train_step += 1
                EMS.train_step.append(EMS.total_train_step)
                EMS.train_loss.append(loss_tmp_total)
                """ print the train loss and tensorboard"""
                if (EMS.total_train_step) % 10 == 0:
                    # print('time : ', time.time() - start)
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                          % (epoch, config.num_epochs, (i + 1),
                             (num_data // (config.batch_size)),
                             loss_tmp_total))
                loss_tmp_total = 0
            EMS.total_train_iter += 1
            # scheduler.step(epoch + i / len(train_loader))
        """ val """
        if Validation == True:
            print("------------------ val --------------------------")
            # Evaluation strategy mirrors the augmentation used in training.
            if fst.flag_cropping == True and fst.flag_eval_cropping == True:
                dict_result = ut.eval_classification_model_cropped_input(
                    config, fold, val_loader, model, criterion_cls)
            elif fst.flag_translation == True and fst.flag_eval_translation == True:
                dict_result = ut.eval_classification_model_esemble(
                    config, fold, val_loader, model, criterion_cls)
            elif fst.flag_MC_dropout == True:
                dict_result = ut.eval_classification_model_MC_dropout(
                    config, fold, val_loader, model, criterion_cls)
            else:
                dict_result = ut.eval_classification_model(
                    config, fold, val_loader, model, criterion_cls)
            val_loss = dict_result['Loss']
            acc = dict_result['Acc']
            auc = dict_result['AUC']
            print('Fold : %d, Epoch [%d/%d] val Loss = %f val Acc = %f'
                  % (fold, epoch, config.num_epochs, val_loss, acc))
            """ save the metric """
            EMS.dict_val_metric['val_loss'].append(val_loss)
            EMS.dict_val_metric['val_acc'].append(acc)
            if fst.flag_loss_2 == True:
                for tmp_i in range(len(st.lambda_aux)):
                    EMS.dict_val_metric['val_acc_aux'][tmp_i].append(
                        dict_result['Acc_aux'][tmp_i])
            EMS.dict_val_metric['val_auc'].append(auc)
            EMS.val_step.append(EMS.total_train_step)
            # Sliding window of the last 5 val losses for model selection.
            n_stacking_loss_for_selection = 5
            if len(EMS.dict_val_metric['val_loss_queue']
                   ) > n_stacking_loss_for_selection:
                EMS.dict_val_metric['val_loss_queue'].popleft()
            EMS.dict_val_metric['val_loss_queue'].append(val_loss)
            EMS.dict_val_metric['val_mean_loss'].append(
                np.mean(EMS.dict_val_metric['val_loss_queue']))
            """ save model """
            for i_tmp in range(len(list_selected_EMS)):
                save_flag = ut.model_save_through_validation(
                    fold, epoch,
                    EMS=EMS,
                    selected_EMS=list_selected_EMS[i_tmp],
                    ES=list_ES[i_tmp],
                    model=model,
                    dir_save_model=list_dir_save_model[i_tmp],
                    metric_1=st.list_standard_eval[i_tmp],
                    metric_2='',
                    save_flag=False)
        if Test_flag == True:
            print(
                "------------------ test _ test dataset --------------------------"
            )
            """ load data """
            if fst.flag_cropping == True and fst.flag_eval_cropping == True:
                print("eval : cropping")
                dict_result = ut.eval_classification_model_cropped_input(
                    config, fold, test_loader, model, criterion_cls)
            elif fst.flag_translation == True and fst.flag_eval_translation == True:
                print("eval : assemble")
                dict_result = ut.eval_classification_model_esemble(
                    config, fold, test_loader, model, criterion_cls)
            elif fst.flag_MC_dropout == True:
                dict_result = ut.eval_classification_model_MC_dropout(
                    config, fold, test_loader, model, criterion_cls)
            else:
                print("eval : whole image")
                dict_result = ut.eval_classification_model(
                    config, fold, test_loader, model, criterion_cls)
            acc = dict_result['Acc']
            test_loss = dict_result['Loss']
            """ pyplot """
            EMS.test_acc.append(acc)
            if fst.flag_loss_2 == True:
                for tmp_i in range(len(st.lambda_aux)):
                    EMS.test_acc_aux[tmp_i].append(
                        dict_result['Acc_aux'][tmp_i])
            EMS.test_loss.append(test_loss)
            EMS.test_step.append(EMS.total_train_step)
            print('number of test samples : {}'.format(len(
                test_loader.dataset)))
            print('Fold : %d, Epoch [%d/%d] test Loss = %f test Acc = %f'
                  % (fold, epoch, config.num_epochs, test_loss, acc))
        """ learning rate decay"""
        EMS.LR.append(optimizer.param_groups[0]['lr'])
        # scheduler.step()
        # scheduler.step(val_loss)
        """ plot the chat """
        if epoch % 1 == 0:
            ut.plot_training_info_1(fold, dir_pyplot, EMS,
                                    flag='percentile', flag_match=False)
        ##TODO : early stop only if all of metric has been stopped
        tmp_count = 0
        for i in range(len(list_ES)):
            if list_ES[i].early_stop == True:
                tmp_count += 1
        if tmp_count == len(list_ES):
            break
    """ release the model """
    del model, EMS
    torch.cuda.empty_cache()
def run(fold):
    """Train the tweet-extraction model on a single cross-validation fold.

    Rows whose ``kfold`` column equals ``fold`` form the validation split;
    the rest are used for training. The best checkpoint (by jaccard, with
    patience 2) is written to ``model_<fold>.bin``.
    """
    folds_df = pd.read_csv(config.TRAINING_FILE)
    train_df = folds_df[folds_df.kfold != fold].reset_index(drop=True)
    valid_df = folds_df[folds_df.kfold == fold].reset_index(drop=True)

    def _make_loader(frame, batch_size, workers):
        # Wrap one split in a TweetDataset and a DataLoader.
        split = TweetDataset(tweet=frame.text.values,
                             sentiment=frame.sentiment.values,
                             selected_text=frame.selected_text.values)
        return torch.utils.data.DataLoader(split,
                                           batch_size=batch_size,
                                           num_workers=workers)

    train_data_loader = _make_loader(train_df, config.TRAIN_BATCH_SIZE, 4)
    valid_data_loader = _make_loader(valid_df, config.VALID_BATCH_SIZE, 2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True  # the head consumes hidden states
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(train_df) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # Biases and LayerNorm parameters get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(tag in name for tag in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)
    optimizer = AdamW(
        [{'params': decayed, 'weight_decay': 0.001},
         {'params': undecayed, 'weight_decay': 0.0}],
        lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        # print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
def train():
    """Train the X-UMX (OpenUnmix_CrossNet) source-separation model with NNabla.

    Builds training and validation computation graphs combining a
    frequency-domain MSE loss and a time-domain SDR loss
    (``loss = mcoef * loss_t + loss_f``), trains with Adam across one or
    more GPUs via ``CommunicatorWrapper``, reduces the learning rate on
    validation-loss plateaus, and early-stops on the validation loss.
    The best model is written to ``<output>/best_xumx.h5`` by rank 0.
    """
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )
    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)
    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    # Only rank 0 logs and creates the output directory.
    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)
    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)
    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    # Shard the iterators so each process sees a disjoint slice of the data.
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)
        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    # Weight decay is scaled by the process count to compensate for
    # gradient averaging across workers.
    weight_decay = args.weight_decay * comm.n_procs
    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)
    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)
    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    # Validation variables use fixed channel counts (2 mixture / 8 target
    # channels) and a fixed chunk duration of valid_dur seconds.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))
    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable([1] +
                                [8, valid_source.sample_rate * args.valid_dur])

    # create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    # Keep the loss buffer alive so .d can be read after forward/backward.
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # create validation graph
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft,
                           n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                # All-reduce gradients across processes during backward.
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            # Full tracks are evaluated in fixed-length chunks of
            # valid_dur seconds; chunk losses are averaged per track.
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                # Stop when the next chunk would be short or the track
                # length was an exact multiple of the chunk duration.
                if x[Ellipsis, sp:sp +
                     dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # Plateau LR schedule and early stopping both track validation loss.
        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)
            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch
        if stop:
            print("Apply Early Stopping")
            break
def train():
    """Train a single-target OpenUnmix separation model with NNabla.

    Builds a static training graph (MSE between predicted and target
    spectrograms), iterates Adam updates per batch, evaluates on the
    validation source every epoch, and early-stops on the average
    validation loss.  Every epoch a checkpoint is written; the best epoch
    additionally saves ``<target>.h5``.
    """
    parser, args = get_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Initialize DataIterator for MNIST.
    # NOTE(review): comment says MNIST but the sources are audio
    # (sample_rate is used below) — comment likely stale; confirm.
    train_source, valid_source, args = data.load_datasources(
        parser, args, rng=RandomState(42))

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    valid_iter = data_iterator(valid_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    # Dataset statistics are used to normalize the model input.
    scaler_mean, scaler_std = get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = model.OpenUnmix(input_mean=scaler_mean,
                            input_scale=scaler_std,
                            nb_channels=args.nb_channels,
                            hidden_size=args.hidden_size,
                            n_fft=args.nfft,
                            n_hop=args.nhop,
                            max_bin=max_bin,
                            sample_rate=train_source.sample_rate)

    # Create input variables.
    audio_shape = [args.batch_size] + list(train_source._get_data(0)[0].shape)
    mixture_audio = nn.Variable(audio_shape)
    target_audio = nn.Variable(audio_shape)
    vmixture_audio = nn.Variable(audio_shape)
    vtarget_audio = nn.Variable(audio_shape)

    # create train graph
    pred_spec = unmix(mixture_audio, test=False)
    # Keep buffer alive so .d can be read after forward.
    pred_spec.persistent = True

    target_spec = model.Spectrogram(*model.STFT(target_audio,
                                                n_fft=unmix.n_fft,
                                                n_hop=unmix.n_hop),
                                    mono=(unmix.nb_channels == 1))

    # Mean over axis=1 keeps a per-sample loss vector; .mean() is applied
    # when logging below.
    loss = F.mean(F.squared_error(pred_spec, target_spec), axis=1)

    # Create Solver.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Training loop.
    t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
    es = utils.EarlyStopping(patience=args.patience)
    for epoch in t:
        # TRAINING
        t.set_description("Training Epoch")
        b = tqdm.trange(0,
                        train_source._size // args.batch_size,
                        disable=args.quiet)
        losses = utils.AverageMeter()
        for batch in b:
            mixture_audio.d, target_audio.d = train_iter.next()
            b.set_description("Training Batch")
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.update()
            losses.update(loss.d.copy().mean())
            b.set_postfix(train_loss=losses.avg)

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(valid_source._size):
            # Create new validation input variables for every batch
            vmixture_audio.d, vtarget_audio.d = valid_iter.next()
            # create validation graph
            # NOTE(review): the graph is rebuilt for every validation
            # batch; parameters are shared so results are unaffected,
            # but this is redundant work — confirm before changing.
            vpred_spec = unmix(vmixture_audio, test=True)
            vpred_spec.persistent = True
            vtarget_spec = model.Spectrogram(*model.STFT(vtarget_audio,
                                                         n_fft=unmix.n_fft,
                                                         n_hop=unmix.n_hop),
                                             mono=(unmix.nb_channels == 1))
            vloss = F.mean(F.squared_error(vpred_spec, vtarget_spec), axis=1)
            vloss.forward(clear_buffer=True)
            vlosses.update(vloss.d.copy().mean())
        t.set_postfix(train_loss=losses.avg, val_loss=vlosses.avg)

        # Early stopping tracks the epoch-average validation loss.
        stop = es.step(vlosses.avg)
        is_best = vlosses.avg == es.best

        # save current model
        nn.save_parameters(
            os.path.join(args.output, 'checkpoint_%s.h5' % args.target))

        if is_best:
            best_epoch = epoch
            nn.save_parameters(os.path.join(args.output,
                                            '%s.h5' % args.target))

        if stop:
            print("Apply Early Stopping")
            break
def retrain(args):
    """Resume training an SRG-family graph model from a saved checkpoint.

    Loads the dataset and masks, rebuilds the model variant named by
    ``args['model']``, restores weights from ``args['model_path']``,
    continues full-batch training with MSE loss and Adam, early-stops on
    validation RMSE, then reports test metrics and appends a per-epoch
    log to ``args['log_path']``.

    Args:
        args: dict of hyperparameters and paths (device, sizes, lr, etc.).

    Raises:
        ValueError: if ``args['model']`` is not a known variant name.
    """
    # load dataset
    g_homo, g_list, pairs, labels, train_mask, val_mask, test_mask = u.load_data(
        args['name'], args['train_size'])

    # transfer
    # Move all tensors to the configured device.
    pairs = t.from_numpy(pairs).to(args['device'])
    labels = t.from_numpy(labels).to(args['device'])
    train_mask = t.from_numpy(train_mask).to(args['device'])
    val_mask = t.from_numpy(val_mask).to(args['device'])
    test_mask = t.from_numpy(test_mask).to(args['device'])

    # Random node features for both graphs.
    # NOTE(review): features are resampled on every retrain run, so they
    # differ from those used at original training time — confirm intended.
    feat1 = t.randn(g_homo.number_of_nodes(),
                    args['in_feats']).to(args['device'])
    feat2 = t.randn(g_list[0].number_of_nodes(),
                    args['in_feats']).to(args['device'])

    # MSELoss expects float targets of shape (N, 1).
    labels = labels.view(-1, 1).to(dtype=t.float32)

    # model
    # Dispatch on the requested architecture variant.
    if args['model'] == 'SRG':
        model = m.SRG(rgcn_in_feats=args['in_feats'],
                      rgcn_out_feats=args['embedding_size'],
                      rgcn_num_blocks=args['num_b'],
                      rgcn_dropout=0.,
                      han_num_meta_path=args['num_meta_path'],
                      han_in_feats=args['in_feats'],
                      han_hidden_feats=args['embedding_size'],
                      han_head_list=args['head_list'],
                      han_dropout=args['drop_out'],
                      fc_hidden_feats=args['fc_units']
                      ).to(args['device'])
    elif args['model'] == 'SRG_GAT':
        model = m.SRG_GAT(rgcn_in_feats=args['in_feats'],
                          rgcn_out_feats=args['embedding_size'],
                          rgcn_num_blocks=args['num_b'],
                          rgcn_dropout=args['drop_out'],
                          han_num_meta_path=args['num_meta_path'],
                          han_in_feats=args['in_feats'],
                          han_hidden_feats=args['embedding_size'],
                          han_head_list=args['head_list'],
                          han_dropout=args['drop_out'],
                          fc_hidden_feats=args['fc_units']
                          ).to(args['device'])
    elif args['model'] == 'SRG_no_GRU':
        model = m.SRG_no_GRU(gcn_in_feats=args['in_feats'],
                             gcn_out_feats=args['embedding_size'],
                             gcn_num_layers=args['num_l'],
                             han_num_meta_path=args['num_meta_path'],
                             han_in_feats=args['in_feats'],
                             han_hidden_feats=args['embedding_size'],
                             han_head_list=args['head_list'],
                             han_dropout=args['drop_out'],
                             fc_hidden_feats=args['fc_units']
                             ).to(args['device'])
    elif args['model'] == 'SRG_Res':
        model = m.SRG_Res(gcn_in_feats=args['in_feats'],
                          gcn_out_feats=args['embedding_size'],
                          gcn_num_layers=args['num_l'],
                          han_num_meta_path=args['num_meta_path'],
                          han_in_feats=args['in_feats'],
                          han_hidden_feats=args['embedding_size'],
                          han_head_list=args['head_list'],
                          han_dropout=args['drop_out'],
                          fc_hidden_feats=args['fc_units']
                          ).to(args['device'])
    elif args['model'] == 'SRG_no_GCN':
        model = m.SRG_no_GCN(han_num_meta_path=args['num_meta_path'],
                             han_in_feats=args['in_feats'],
                             han_hidden_feats=args['embedding_size'],
                             han_head_list=args['head_list'],
                             han_dropout=args['drop_out'],
                             fc_hidden_feats=args['fc_units']
                             ).to(args['device'])
    else:
        raise ValueError('wrong name of the model')

    # Restore previously trained weights.
    model.load_state_dict(t.load(args['model_path']))

    # log
    log = []

    # Seed early stopping with the checkpoint's current validation metrics
    # so retraining only keeps genuinely better models.
    mae, rmse = u.evaluate(model, g_homo, feat1, g_list,
                           feat2, pairs, labels, val_mask)
    early_stop = u.EarlyStopping(
        args['model_path'], patience=args['patience'], rmse=rmse, mae=mae)

    # loss, optimizer
    loss_func = t.nn.MSELoss()
    optimizer = t.optim.Adam(
        model.parameters(), lr=args['lr'], weight_decay=args['decay'])

    # train
    for epoch in range(args['epochs']):
        dt = datetime.now()
        model.train()
        # Full-batch forward over both graphs.
        y_pred = model(g_homo, feat1, g_list, feat2, pairs)
        loss = loss_func(y_pred[train_mask], labels[train_mask])
        loss.backward()
        optimizer.step()
        # Grads are cleared after the step, before the next backward.
        optimizer.zero_grad()

        train_mae, train_rmse = u.metrics(
            y_pred[train_mask].detach(), labels[train_mask])
        val_mae, val_rmse = u.evaluate(
            model, g_homo, feat1, g_list, feat2, pairs, labels, val_mask)
        # EarlyStopping checkpoints on improvement and reports when to stop.
        stop = early_stop.step(val_rmse, val_mae, model)

        elapse = str(datetime.now() - dt)[:10] + '\n'
        log.append(' '.join(str(x) for x in (epoch, train_mae, train_rmse,
                                             val_mae, val_rmse, elapse)))
        print(f'epoch={epoch} | train_MAE={train_mae} | train_RMSE={train_rmse} | val_MAE={val_mae} | val_RMSE={val_rmse} | elapse={elapse}')
        if stop:
            break

    # Evaluate the best checkpoint on the test split.
    early_stop.load_checkpoint(model)
    test_mae, test_rmse = u.evaluate(
        model, g_homo, feat1, g_list, feat2, pairs, labels, test_mask)
    print(f'test_MAE={test_mae} | test_RMSE={test_rmse}')

    # save log
    with open(args['log_path'], 'a') as f:
        f.writelines(log)
def main():
    """Entry point for the single-model Open Unmix trainer.

    Parses CLI arguments, builds train/validation dataloaders (optionally
    subsampling the training set by ``--reduce-samples``), constructs an
    ``OpenUnmixSingle`` model, and runs the epoch loop with plateau LR
    decay and early stopping.  Supports resuming from ``--model``: the
    checkpoint, optimizer/scheduler state, loss histories, and early-stop
    counters are all restored from the saved JSON/checkpoint pair.
    """
    parser = argparse.ArgumentParser(description='Open Unmix Trainer')

    # Loss parameters
    parser.add_argument('--loss',
                        type=str,
                        default="L2freq",
                        choices=[
                            'L2freq', 'L1freq', 'L2time', 'L1time', 'L2mask',
                            'L1mask', 'SISDRtime', 'SISDRfreq', 'MinSNRsdsdr',
                            'CrossEntropy', 'BinaryCrossEntropy', 'LogL2time',
                            'LogL1time', 'LogL2freq', 'LogL1freq', 'PSA',
                            'SNRPSA', 'Dissimilarity'
                        ],
                        help='kind of loss used during training')

    # Dataset paramaters
    parser.add_argument('--dataset',
                        type=str,
                        default="musdb",
                        choices=[
                            'musdb', 'aligned', 'sourcefolder',
                            'trackfolder_var', 'trackfolder_fix'
                        ],
                        help='Name of the dataset.')
    parser.add_argument('--root', type=str, help='root path of dataset')
    parser.add_argument('--output',
                        type=str,
                        default="open-unmix",
                        help='provide output path base folder name')
    parser.add_argument('--model', type=str, help='Path to checkpoint folder')

    # Trainig Parameters
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--reduce-samples',
                        type=int,
                        default=1,
                        help="reduce training samples by factor n")
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate, defaults to 1e-3')
    parser.add_argument(
        '--patience',
        type=int,
        default=140,
        help='maximum number of epochs to train (default: 140)')
    parser.add_argument('--lr-decay-patience',
                        type=int,
                        default=80,
                        help='lr decay patience for plateau scheduler')
    parser.add_argument('--lr-decay-gamma',
                        type=float,
                        default=0.3,
                        help='gamma of learning rate scheduler decay')
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.00001,
                        help='weight decay')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='S',
                        help='random seed (default: 42)')

    # Model Parameters
    parser.add_argument('--seq-dur',
                        type=float,
                        default=6.0,
                        help='Sequence duration in seconds'
                        'value of <=0.0 will use full/variable length')
    parser.add_argument(
        '--unidirectional',
        action='store_true',
        default=False,
        help='Use unidirectional LSTM instead of bidirectional')
    parser.add_argument('--nfft',
                        type=int,
                        default=4096,
                        help='STFT fft size and window size')
    parser.add_argument('--nhop', type=int, default=1024, help='STFT hop size')
    parser.add_argument(
        '--hidden-size',
        type=int,
        default=512,
        help='hidden size parameter of dense bottleneck layers')
    parser.add_argument('--bandwidth',
                        type=int,
                        default=16000,
                        help='maximum model bandwidth in herz')
    parser.add_argument('--nb-channels',
                        type=int,
                        default=2,
                        help='set number of channels for model (1, 2)')
    parser.add_argument('--nb-workers',
                        type=int,
                        default=0,
                        help='Number of workers for dataloader.')

    # Misc Parameters
    parser.add_argument('--quiet',
                        action='store_true',
                        default=False,
                        help='less verbose during training')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    args, _ = parser.parse_known_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print("Using GPU:", use_cuda)
    dataloader_kwargs = {
        'num_workers': args.nb_workers,
        'pin_memory': True
    } if use_cuda else {}

    # Record the current git commit for reproducibility in saved params.
    repo_dir = os.path.abspath(os.path.dirname(__file__))
    repo = Repo(repo_dir)
    commit = repo.head.commit.hexsha[:7]

    # use jpg or npy
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    train_dataset, valid_dataset, args = data.load_datasets(parser, args)

    num_train = len(train_dataset)
    indices = list(range(num_train))
    # shuffle train indices once and for all
    np.random.seed(args.seed)
    np.random.shuffle(indices)
    # Optionally train on a 1/n subset of the shuffled indices.
    if args.reduce_samples > 1:
        split = int(np.floor(num_train / args.reduce_samples))
        train_idx = indices[:split]
    else:
        train_idx = indices
    sampler = SubsetRandomSampler(train_idx)

    # create output dir if not exist
    target_path = Path(args.output)
    target_path.mkdir(parents=True, exist_ok=True)

    train_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=args.batch_size,
                                                sampler=sampler,
                                                **dataloader_kwargs)
    # batch_size=1 loader used only for computing input statistics.
    stats_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=1,
                                                sampler=sampler,
                                                **dataloader_kwargs)
    valid_sampler = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=1,
                                                **dataloader_kwargs)

    # When resuming, normalization statistics come from the checkpoint, so
    # skip the (expensive) statistics pass.
    if args.model:
        scaler_mean = None
        scaler_std = None
    else:
        scaler_mean, scaler_std = get_statistics(args, stats_sampler)

    max_bin = utils.bandwidth_to_max_bin(train_dataset.sample_rate, args.nfft,
                                         args.bandwidth)

    # SNRPSA: de-compress the scaler in order to avoid an exploding gradient from the uncompressed initial statistics
    if args.loss == 'SNRPSA':
        power = 2
    else:
        power = 1

    unmix = model.OpenUnmixSingle(
        n_fft=4096,
        n_hop=1024,
        input_is_spectrogram=False,
        hidden_size=args.hidden_size,
        nb_channels=args.nb_channels,
        sample_rate=train_dataset.sample_rate,
        nb_layers=3,
        input_mean=scaler_mean,
        input_scale=scaler_std,
        max_bin=max_bin,
        unidirectional=args.unidirectional,
        power=power,
    ).to(device)

    print('learning rate:')
    print(args.lr)

    optimizer = torch.optim.Adam(unmix.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_decay_gamma,
        patience=args.lr_decay_patience,
        cooldown=10)

    es = utils.EarlyStopping(patience=args.patience)

    # if a model is specified: resume training
    if args.model:
        print('LOADING MODEL')
        model_path = Path(args.model).expanduser()
        # NOTE(review): the results file is named after len(args.targets),
        # which is set by data.load_datasets — confirm against the saver.
        with open(Path(model_path,
                       str(len(args.targets)) + '.json'), 'r') as stream:
            results = json.load(stream)

        target_model_path = Path(model_path, "model.chkpnt")
        checkpoint = torch.load(target_model_path, map_location=device)
        unmix.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

        # train for another epochs_trained
        t = tqdm.trange(results['epochs_trained'],
                        results['epochs_trained'] + args.epochs + 1,
                        disable=args.quiet)
        train_losses = results['train_loss_history']
        valid_losses = results['valid_loss_history']
        train_times = results['train_time_history']
        best_epoch = results['best_epoch']
        # Restore early-stopping internals so patience carries over.
        es.best = results['best_loss']
        es.num_bad_epochs = results['num_bad_epochs']
        print('Model loaded')
    # else start from 0
    else:
        t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
        train_losses = []
        valid_losses = []
        train_times = []
        best_epoch = 0

    for epoch in t:
        t.set_description("Training Epoch")
        end = time.time()
        train_loss = train(args, unmix, device, train_sampler, optimizer)
        valid_loss = valid(args, unmix, device, valid_sampler)
        scheduler.step(valid_loss)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        t.set_postfix(train_loss=train_loss, val_loss=valid_loss)

        stop = es.step(valid_loss)

        if valid_loss == es.best:
            best_epoch = epoch

        # Checkpoint every epoch; is_best flags the best-loss epoch.
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': unmix.state_dict(),
                'best_loss': es.best,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            },
            is_best=valid_loss == es.best,
            path=target_path,
        )

        # save params
        params = {
            'epochs_trained': epoch,
            'args': vars(args),
            'best_loss': es.best,
            'best_epoch': best_epoch,
            'train_loss_history': train_losses,
            'valid_loss_history': valid_losses,
            'train_time_history': train_times,
            'num_bad_epochs': es.num_bad_epochs,
            'commit': commit
        }

        with open(Path(target_path,
                       str(len(args.targets)) + '.json'), 'w') as outfile:
            outfile.write(json.dumps(params, indent=4, sort_keys=True))

        train_times.append(time.time() - end)

        if stop:
            print("Apply Early Stopping")
            break
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_func, metrics, epochs, model_dir, lr_scheduler, restore_file=None):
    """Train the model and evaluate every epoch.

    Each epoch trains, evaluates on the validation set, checkpoints
    (tracking the best validation accuracy), and applies early stopping
    (patience 20) on the validation loss.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)

    Returns:
        dict with per-epoch "train_loss" and "val_loss" histories.
    """
    train_history, val_history = [], []
    stopper = utils.EarlyStopping(patience=20, verbose=True)

    # Optionally resume from a saved checkpoint before training starts.
    if restore_file is not None:
        ckpt_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(ckpt_path))
        utils.load_checkpoint(ckpt_path, model, optimizer)

    best_score = 0.0  # best validation accuracy seen so far (could swap in another metric)

    for epoch_idx in range(epochs):
        logging.info("lr = {}".format(lr_scheduler.get_last_lr()))
        logging.info("Epoch {}/{}".format(epoch_idx + 1, epochs))

        # One pass over the training data, then a full validation pass.
        epoch_train_loss = train(model, optimizer, loss_func,
                                 train_dataloader, metrics, lr_scheduler)
        val_metrics = evaluate(model, loss_func, val_dataloader, metrics)

        epoch_val_loss = val_metrics['loss']
        train_history.append(epoch_train_loss)
        val_history.append(epoch_val_loss)

        current_score = val_metrics['acc']
        improved = current_score >= best_score

        # Checkpoint every epoch; flag the best-accuracy epoch.
        utils.save_checkpoint(
            {
                'epoch': epoch_idx + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=improved,
            checkpoint=model_dir)

        if improved:
            logging.info("- Found new best accuracy")
            best_score = current_score
            best_json_path = os.path.join(model_dir,
                                          "val_acc_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Always record the latest epoch's validation metrics.
        last_json_path = os.path.join(model_dir, "val_acc_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # Early stopping is driven by validation loss, not accuracy.
        stopper(epoch_val_loss, model)
        if stopper.early_stop:
            logging.info("Early stopping!")
            break

    return {"train_loss": train_history, "val_loss": val_history}
def main(): parser = argparse.ArgumentParser(description='Open Unmix Trainer') # which target do we want to train? # ============================================================================= # parser.add_argument('--target', type=str, default='vocals', # help='target source (will be passed to the dataset)') # # ============================================================================= parser.add_argument('--target', type=str, default='tabla', help='target source (will be passed to the dataset)') # Dataset paramaters parser.add_argument('--dataset', type=str, default="aligned", choices=[ 'musdb', 'aligned', 'sourcefolder', 'trackfolder_var', 'trackfolder_fix' ], help='Name of the dataset.') parser.add_argument('--root', type=str, help='root path of dataset', default='../rec_data_final/') parser.add_argument('--output', type=str, default="../new_models/model_tabla_mtl_ourmix_1", help='provide output path base folder name') #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data_aug_tabla_mse_pretrain1') #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default="../out_unmix/model_new_data_aug_tabla_mse_pretrain8" ) #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data_aug_tabla_bce_finetune2') parser.add_argument('--model', type=str, help='Path to checkpoint folder') #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='umxhq') parser.add_argument( '--onset-model', type=str, help='Path to onset detection model weights', default= "/media/Sharedata/rohit/cnn-onset-det/models/apr4/saved_model_0_80mel-0-16000_1ch_44100.pt" ) # Trainig Parameters parser.add_argument('--epochs', type=int, default=1000) parser.add_argument('--batch-size', type=int, default=16) parser.add_argument('--lr', type=float, default=0.001, help='learning rate, defaults to 1e-3') parser.add_argument( '--patience', 
type=int, default=140, help='maximum number of epochs to train (default: 140)') parser.add_argument('--lr-decay-patience', type=int, default=80, help='lr decay patience for plateau scheduler') parser.add_argument('--lr-decay-gamma', type=float, default=0.3, help='gamma of learning rate scheduler decay') parser.add_argument('--weight-decay', type=float, default=0.00001, help='weight decay') parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)') parser.add_argument('--gamma', type=float, default=0.0, help='weighting of different loss components') parser.add_argument( '--finetune', type=int, default=0, help= 'If true(1), then optimiser states from checkpoint model are reset (required for bce finetuning), false if aim is to resume training from where it was left off' ) parser.add_argument('--onset-thresh', type=float, default=0.3, help='Threshold above which onset is said to occur') parser.add_argument( '--binarise', type=int, default=0, help= 'If=1(true), then target novelty function is made binary, if=0(false), then left as it is' ) parser.add_argument( '--onset-trainable', type=int, default=0, help= 'If=1(true), then onsetCNN will also get trained in finetuning stage, if=0(false) then kept fixed' ) # Model Parameters parser.add_argument('--seq-dur', type=float, default=6.0, help='Sequence duration in seconds' 'value of <=0.0 will use full/variable length') parser.add_argument( '--unidirectional', action='store_true', default=False, help='Use unidirectional LSTM instead of bidirectional') parser.add_argument('--nfft', type=int, default=4096, help='STFT fft size and window size') parser.add_argument('--nhop', type=int, default=1024, help='STFT hop size') # ============================================================================= # parser.add_argument('--nfft', type=int, default=2048, # help='STFT fft size and window size') # parser.add_argument('--nhop', type=int, default=512, # help='STFT hop size') # 
============================================================================= parser.add_argument('--n-mels', type=int, default=80, help='Number of bins in mel spectrogram') parser.add_argument( '--hidden-size', type=int, default=512, help='hidden size parameter of dense bottleneck layers') parser.add_argument('--bandwidth', type=int, default=16000, help='maximum model bandwidth in herz') parser.add_argument('--nb-channels', type=int, default=2, help='set number of channels for model (1, 2)') parser.add_argument('--nb-workers', type=int, default=4, help='Number of workers for dataloader.') # Misc Parameters parser.add_argument('--quiet', action='store_true', default=False, help='less verbose during training') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') args, _ = parser.parse_known_args() use_cuda = not args.no_cuda and torch.cuda.is_available() print("Using GPU:", use_cuda) print("Using Torchaudio: ", utils._torchaudio_available()) dataloader_kwargs = { 'num_workers': args.nb_workers, 'pin_memory': True } if use_cuda else {} repo_dir = os.path.abspath(os.path.dirname(__file__)) repo = Repo(repo_dir) commit = repo.head.commit.hexsha[:7] # use jpg or npy torch.manual_seed(args.seed) random.seed(args.seed) device = torch.device("cuda" if use_cuda else "cpu") torch.autograd.set_detect_anomaly(True) train_dataset, valid_dataset, args = data.load_datasets(parser, args) print("TRAIN DATASET", train_dataset) print("VALID DATASET", valid_dataset) # create output dir if not exist target_path = Path(args.output) target_path.mkdir(parents=True, exist_ok=True) train_sampler = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **dataloader_kwargs) valid_sampler = torch.utils.data.DataLoader(valid_dataset, batch_size=1, **dataloader_kwargs) if args.model: scaler_mean = None scaler_std = None else: scaler_mean, scaler_std = get_statistics(args, train_dataset) max_bin = 
utils.bandwidth_to_max_bin(train_dataset.sample_rate, args.nfft, args.bandwidth) unmix = model_mtl.OpenUnmix_mtl( input_mean=scaler_mean, input_scale=scaler_std, nb_channels=args.nb_channels, hidden_size=args.hidden_size, n_fft=args.nfft, n_hop=args.nhop, max_bin=max_bin, sample_rate=train_dataset.sample_rate).to(device) #Read trained onset detection network (Model through which target spectrogram is passed) detect_onset = model.onsetCNN().to(device) detect_onset.load_state_dict( torch.load(args.onset_model, map_location='cuda:0')) #Model through which separated output is passed # detect_onset_training = model.onsetCNN().to(device) # detect_onset_training.load_state_dict(torch.load(args.onset_model, map_location='cuda:0')) for child in detect_onset.children(): for param in child.parameters(): param.requires_grad = False #If onset trainable is false, then we want to keep the weights of this moel fixed # if (args.onset_trainable == 0): # for child in detect_onset_training.children(): # for param in child.parameters(): # param.requires_grad = False # #FOR CHECKING, REMOVE LATER # for child in detect_onset_training.children(): # for param in child.parameters(): # print(param.requires_grad) optimizer = torch.optim.Adam(unmix.parameters(), lr=args.lr, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, factor=args.lr_decay_gamma, patience=args.lr_decay_patience, cooldown=10) es = utils.EarlyStopping(patience=args.patience) # if a model is specified: resume training if args.model: model_path = Path(args.model).expanduser() with open(Path(model_path, args.target + '.json'), 'r') as stream: results = json.load(stream) target_model_path = Path(model_path, args.target + ".chkpnt") checkpoint = torch.load(target_model_path, map_location=device) unmix.load_state_dict(checkpoint['state_dict']) #Only when onse is trainable and when that finetuning is being resumed from a point where it is left off, then read the onset state_dict # if 
((args.onset_trainable==1)and(args.finetune==0)): # detect_onset_training.load_state_dict(checkpoint['onset_state_dict']) # print("Reading saved onset model") # else: # print("Not reading saved onset model") if (args.finetune == 0): optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) # train for another epochs_trained t = tqdm.trange(results['epochs_trained'], results['epochs_trained'] + args.epochs + 1, disable=args.quiet) print("PICKUP WHERE LEFT OFF", args.finetune) train_losses = results['train_loss_history'] train_mse_losses = results['train_mse_loss_history'] train_bce_losses = results['train_bce_loss_history'] valid_losses = results['valid_loss_history'] valid_mse_losses = results['valid_mse_loss_history'] valid_bce_losses = results['valid_bce_loss_history'] train_times = results['train_time_history'] best_epoch = results['best_epoch'] es.best = results['best_loss'] es.num_bad_epochs = results['num_bad_epochs'] else: t = tqdm.trange(1, args.epochs + 1, disable=args.quiet) train_losses = [] train_mse_losses = [] train_bce_losses = [] print("NOT PICKUP WHERE LEFT OFF", args.finetune) valid_losses = [] valid_mse_losses = [] valid_bce_losses = [] train_times = [] best_epoch = 0 #es.best = results['best_loss'] #es.num_bad_epochs = results['num_bad_epochs'] # else start from 0 else: t = tqdm.trange(1, args.epochs + 1, disable=args.quiet) train_losses = [] train_mse_losses = [] train_bce_losses = [] valid_losses = [] valid_mse_losses = [] valid_bce_losses = [] train_times = [] best_epoch = 0 for epoch in t: t.set_description("Training Epoch") end = time.time() train_loss, train_mse_loss, train_bce_loss = train( args, unmix, device, train_sampler, optimizer, detect_onset=detect_onset) #train_mse_loss = train(args, unmix, device, train_sampler, optimizer, detect_onset=detect_onset)[1] #train_bce_loss = train(args, unmix, device, train_sampler, optimizer, detect_onset=detect_onset)[2] valid_loss, valid_mse_loss, 
valid_bce_loss = valid( args, unmix, device, valid_sampler, detect_onset=detect_onset) #valid_mse_loss = valid(args, unmix, device, valid_sampler, detect_onset=detect_onset)[1] #valid_bce_loss = valid(args, unmix, device, valid_sampler, detect_onset=detect_onset)[2] scheduler.step(valid_loss) train_losses.append(train_loss) train_mse_losses.append(train_mse_loss) train_bce_losses.append(train_bce_loss) valid_losses.append(valid_loss) valid_mse_losses.append(valid_mse_loss) valid_bce_losses.append(valid_bce_loss) t.set_postfix(train_loss=train_loss, val_loss=valid_loss) stop = es.step(valid_loss) #from matplotlib import pyplot as plt # ============================================================================= # plt.figure(figsize=(16,12)) # plt.subplot(2, 2, 1) # plt.title("Training loss") # plt.plot(train_losses,label="Training") # plt.xlabel("Iterations") # plt.ylabel("Loss") # plt.legend() # plt.show() # #plt.savefig(Path(target_path, "train_plot.pdf")) # # plt.figure(figsize=(16,12)) # plt.subplot(2, 2, 2) # plt.title("Validation loss") # plt.plot(valid_losses,label="Validation") # plt.xlabel("Iterations") # plt.ylabel("Loss") # plt.legend() # plt.show() # #plt.savefig(Path(target_path, "val_plot.pdf")) # ============================================================================= if valid_loss == es.best: best_epoch = epoch utils.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': unmix.state_dict(), 'best_loss': es.best, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 'onset_state_dict': detect_onset.state_dict() }, is_best=valid_loss == es.best, path=target_path, target=args.target) # save params params = { 'epochs_trained': epoch, 'args': vars(args), 'best_loss': es.best, 'best_epoch': best_epoch, 'train_loss_history': train_losses, 'train_mse_loss_history': train_mse_losses, 'train_bce_loss_history': train_bce_losses, 'valid_loss_history': valid_losses, 'valid_mse_loss_history': valid_mse_losses, 'valid_bce_loss_history': 
valid_bce_losses, 'train_time_history': train_times, 'num_bad_epochs': es.num_bad_epochs, 'commit': commit } with open(Path(target_path, args.target + '.json'), 'w') as outfile: outfile.write(json.dumps(params, indent=4, sort_keys=True)) train_times.append(time.time() - end) if stop: print("Apply Early Stopping") break # ============================================================================= # plt.figure(figsize=(16,12)) # plt.subplot(2, 2, 1) # plt.title("Training loss") # #plt.plot(train_losses,label="Training") # plt.plot(train_losses,label="Training") # plt.xlabel("Iterations") # plt.ylabel("Loss") # plt.legend() # #plt.show() # # plt.figure(figsize=(16,12)) # plt.subplot(2, 2, 2) # plt.title("Validation loss") # plt.plot(valid_losses,label="Validation") # plt.xlabel("Iterations") # plt.ylabel("Loss") # plt.legend() # plt.show() # plt.savefig(Path(target_path, "train_val_plot.pdf")) # #plt.savefig(Path(target_path, "train_plot.pdf")) # ============================================================================= print("TRAINING DONE!!") plt.figure() plt.title("Training loss") plt.plot(train_losses, label="Training") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() plt.savefig(Path(target_path, "train_plot.pdf")) plt.figure() plt.title("Validation loss") plt.plot(valid_losses, label="Validation") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() plt.savefig(Path(target_path, "val_plot.pdf")) plt.figure() plt.title("Training BCE loss") plt.plot(train_bce_losses, label="Training") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() plt.savefig(Path(target_path, "train_bce_plot.pdf")) plt.figure() plt.title("Validation BCE loss") plt.plot(valid_bce_losses, label="Validation") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() plt.savefig(Path(target_path, "val_bce_plot.pdf")) plt.figure() plt.title("Training MSE loss") plt.plot(train_mse_losses, label="Training") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() 
plt.savefig(Path(target_path, "train_mse_plot.pdf")) plt.figure() plt.title("Validation MSE loss") plt.plot(valid_mse_losses, label="Validation") plt.xlabel("Iterations") plt.ylabel("Loss") plt.legend() plt.savefig(Path(target_path, "val_mse_plot.pdf"))
def train_deep():
    """Train the FCN pose classifier with Adam and early stopping.

    NOTE(review): relies on module-level names defined elsewhere in this
    file: `PoseDataset`, `root_dir`, `train_data_dirs`, `test_data_dirs`,
    and `utils` (whose EarlyStopping takes (patience, save_dir) here).
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import Dataset
    from model import FCN
    from torch.optim import lr_scheduler

    def train(model, device, train_loader, optimizer):
        # One optimization pass over the training data.
        # Returns the accumulated batch-mean NLL loss divided by the
        # dataset size (i.e. an average scaled by batch count/size).
        model.train()
        train_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        return train_loss / len(train_loader.dataset)

    def test(model, device, test_loader):
        # Gradient-free evaluation; returns (mean loss, accuracy in %).
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(
                    output, target, reduction="sum").item()  # sum up batch loss
                pred = output.argmax(
                    dim=1, keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        test_acc = 100.0 * correct / len(test_loader.dataset)
        return test_loss, test_acc

    # training settings
    batch_size = 32
    test_batch_size = 1000
    epochs = 500
    patience = 30  # for early stopping
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(9)  # fixed seed for reproducibility
    device = torch.device("cuda" if use_cuda else "cpu")

    # pin_memory/workers only help when transferring to GPU
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        PoseDataset([root_dir / d for d in train_data_dirs]),
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        PoseDataset([root_dir / d for d in test_data_dirs], mode="test"),
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    model = FCN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=True)
    # Checkpoints/metrics are written under "results" by the stopper.
    early_stopping = utils.EarlyStopping(patience, Path("results"))

    for epoch in range(1, epochs + 1):
        train_loss = train(model, device, train_loader, optimizer)
        test_loss, test_acc = test(model, device, test_loader)
        print(f"epoch: {epoch:>3}, train_loss: {train_loss:.4f}, ", end="")
        print(f"test_loss: {test_loss:.4f}, test_acc: {test_acc:.3f}")
        # Stopper tracks test loss; also receives accuracy and the model
        # (presumably for checkpointing -- see utils.EarlyStopping).
        early_stopping(test_loss, test_acc, model)
        if early_stopping.early_stop:
            print("Early stopping activated")
            break
    print(f"deep model acc: {early_stopping.best_acc}")
    # Tail of the evaluation routine: overall accuracy plus running best.
    acc = 100.0 * correct / total
    if acc > best_acc:
        best_acc = acc
    return acc, best_acc


if __name__ == "__main__":
    try:
        # trial get next parameter from network morphism tuner
        RCV_CONFIG = nni.get_next_parameter()
        logger.debug(RCV_CONFIG)
        parse_rev_args(RCV_CONFIG)
        train_acc = 0.0
        best_acc = 0.0
        # Stop once test accuracy plateaus (mode="max": higher is better).
        early_stop = utils.EarlyStopping(mode="max")
        for ep in range(args.epochs):
            train_acc = train(ep)
            test_acc, best_acc = test(ep)
            nni.report_intermediate_result(test_acc)
            logger.debug(test_acc)
            if early_stop.step(test_acc):
                break

        # trial report best_acc to tuner
        nni.report_final_result(best_acc)
    except Exception as exception:
        # Log the failure so NNI records it, then re-raise to fail the trial.
        logger.exception(exception)
        raise
#plt.imshow(grid_img.permute(1, 2, 0)) #plt.figure() #plt.title('Ground Truths') #gt_grid = vutils.make_grid(Y, nrow=4) #plt.imshow(gt_grid.permute(1,2,0)) model = models.unet(n_channels=3, n_classes=1) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = model.to(device) #summary(model, input_size=(3, 144, 144)) opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.95) scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', patience=5, verbose=True) early_stopping = utils.EarlyStopping(patience=8, verbose=True) print('='*30) print('Training') print('='*30) epoch_train_loss = [] epoch_val_loss = [] epoch_train_dsc = [] epoch_val_dsc = [] for epoch in range(num_epochs): train_losses = [] train_dsc = [] val_losses = [] val_dsc = []
p.start() single_acc, current_ep = train_eval(init_search_space_point, RCV_CONFIG, int(nni.get_sequence_id())) print("HPO-" + str(train_num) + ",hyperparameters:" + str(init_search_space_point) + ",best_val_acc:" + str(single_acc)) best_final = single_acc searched_space_point = init_search_space_point if int(nni.get_sequence_id()) > 3 * args.slave - 1: dict_first_data = init_search_space_point TPE.receive_trial_result(train_num, dict_first_data, single_acc) TPEearlystop = utils.EarlyStopping(patience=3, mode="max") for train_num in range(1, args.maxTPEsearchNum): params = TPE.generate_parameters(train_num) start_date = time.strftime('%m/%d/%Y, %H:%M:%S', time.localtime(time.time())) current_hyperparameter = params hp_path = experiment_path + '/hyperparameter_epoch/' + str( nni.get_trial_id()) + '/' + str(train_num) + '.json' with open(hp_path, 'w') as f: json.dump( { 'get_sequence_id': int(nni.get_sequence_id()), 'hyperparameter': current_hyperparameter, 'epoch': 0,
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    """Train a RoBERTa TweetModel on one CV fold, early-stopping on Jaccard.

    Args:
        fold: k-fold index held out for validation.
        epochs: number of training epochs.
        training_file: CSV with `kfold`, `text`, `sentiment`, `selected_text`.
        tokenizer, max_len: tokenization settings forwarded to TweetDataset.
        train_batch_size, valid_batch_size: DataLoader batch sizes.
        roberta_path: pretrained model/config path.
        lr: AdamW learning rate.
        patience: early-stopping patience (mode="max" on Jaccard).
        num_warmup_steps: linear-warmup steps for the LR schedule.

    FIX: fall back to CPU when CUDA is unavailable (the sibling `run()`
    entry point already does this) instead of crashing on
    `torch.device("cuda")`.
    """
    dfx = pd.read_csv(training_file)
    # Hold out the current fold for validation, train on the rest.
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    # Training set
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Samplers are optional: only used when shuffling is requested.
    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2,
        sampler=valid_sampler
    )

    # Use the GPU when present, otherwise run on CPU (consistent with run()).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    # Hidden states are needed downstream by TweetModel.
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    # Bias/LayerNorm parameters are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    if args.fp16:
        # NOTE(review): requires apex (https://www.github.com/nvidia/apex);
        # the import guard was commented out upstream of this call.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    # Early stopping maximizes the Jaccard validation score.
    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        # es() also checkpoints the model when the score improves.
        es(jaccard, model, model_path=f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

    # Release references and GPU memory before the next fold.
    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
def train():
    """ Train the Siamese model, sweeping the decision threshold.

    For each candidate threshold, train up to config.EPOCHS epochs with
    early stopping on validation F1 (mode="max"), then record the best F1
    achieved at that threshold.

    FIXES:
    * The EarlyStopping instance is now re-created per threshold. Previously
      a single instance was shared, so after the first threshold triggered
      `early_stop`, every later threshold aborted after one epoch.
    * `best_f1`/`best_th` are now updated after every evaluation. Previously
      they were updated only inside the early-stop branch, so a threshold
      whose training ran the full epoch budget was never recorded.

    NOTE(review): the model itself is not re-initialized between thresholds,
    so later thresholds continue training the same weights -- confirm this is
    intended.
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate the dataset with training data; cache pre-computed
    # features on disk so repeated runs skip feature extraction.
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)
    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Same caching scheme for the validation data.
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)
    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # NOTE(review): GPU index 2 is hard-coded here -- confirm this matches
    # the deployment machine.
    device = torch.device("cuda:2")

    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states -- needed to concatenate the hidden states from
    # the last 2 BERT layers.
    model_config.output_hidden_states = True
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # Exclude bias/LayerNorm parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Linear decay with no warmup: LR starts at 3e-5 and decreases each step.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        # Fresh early-stopping state for each threshold (patience of 2
        # epochs without F1 improvement).
        es = utils.EarlyStopping(patience=2, mode="max")
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader,
                     model,
                     optimizer,
                     device,
                     scheduler=scheduler,
                     threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)
            # logger.info(f"acc = {acc}, f1 score = {f1}")
            # Track the best F1 across all thresholds and epochs.
            if f1 > best_f1:
                best_f1 = f1
                best_th = threshold
            # es() also checkpoints the model when F1 improves.
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                print("Early stopping ********")
                break
    logger.info(f"best threshold:{best_th}, best f1 :{best_f1}")
def main():
    """Open-Unmix training entry point: parse args, build data/model, train.

    Loads the pretrained 'umxhq' model from torch.hub when --model is set
    (fine-tuning path), then runs the epoch loop with LR-plateau scheduling
    and early stopping, checkpointing and dumping metrics JSON every epoch.
    """
    parser = argparse.ArgumentParser(description='Open Unmix Trainer')

    # which target do we want to train?
    parser.add_argument('--target', type=str, default='vocals',
                        help='target source (will be passed to the dataset)')

    # Dataset paramaters
    parser.add_argument('--dataset', type=str, default="aligned",
                        choices=[
                            'musdb', 'aligned', 'sourcefolder',
                            'trackfolder_var', 'trackfolder_fix'
                        ],
                        help='Name of the dataset.')
    parser.add_argument('--root', type=str, help='root path of dataset',
                        default='../rec_data_new/')
    parser.add_argument('--output', type=str,
                        default="../out_unmix/model_new_data_aug_tl",
                        help='provide output path base folder name')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder')
    parser.add_argument('--model', type=str, help='Path to checkpoint folder',
                        default='umxhq')

    # Trainig Parameters
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate, defaults to 1e-3')
    parser.add_argument(
        '--patience', type=int, default=140,
        help='maximum number of epochs to train (default: 140)')
    parser.add_argument('--lr-decay-patience', type=int, default=80,
                        help='lr decay patience for plateau scheduler')
    parser.add_argument('--lr-decay-gamma', type=float, default=0.3,
                        help='gamma of learning rate scheduler decay')
    parser.add_argument('--weight-decay', type=float, default=0.0000000001,
                        help='weight decay')
    parser.add_argument('--seed', type=int, default=42, metavar='S',
                        help='random seed (default: 42)')

    # Model Parameters
    parser.add_argument('--seq-dur', type=float, default=6.0,
                        help='Sequence duration in seconds'
                        'value of <=0.0 will use full/variable length')
    parser.add_argument(
        '--unidirectional', action='store_true', default=False,
        help='Use unidirectional LSTM instead of bidirectional')
    parser.add_argument('--nfft', type=int, default=4096,
                        help='STFT fft size and window size')
    parser.add_argument('--nhop', type=int, default=1024,
                        help='STFT hop size')
    parser.add_argument(
        '--hidden-size', type=int, default=512,
        help='hidden size parameter of dense bottleneck layers')
    parser.add_argument('--bandwidth', type=int, default=16000,
                        help='maximum model bandwidth in herz')
    parser.add_argument('--nb-channels', type=int, default=2,
                        help='set number of channels for model (1, 2)')
    parser.add_argument('--nb-workers', type=int, default=4,
                        help='Number of workers for dataloader.')

    # Misc Parameters
    parser.add_argument('--quiet', action='store_true', default=False,
                        help='less verbose during training')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')

    # parse_known_args: unknown flags are consumed later by data.load_datasets
    args, _ = parser.parse_known_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print("Using GPU:", use_cuda)
    print("Using Torchaudio: ", utils._torchaudio_available())
    dataloader_kwargs = {
        'num_workers': args.nb_workers,
        'pin_memory': True
    } if use_cuda else {}

    # Record the current git commit so saved params are traceable.
    repo_dir = os.path.abspath(os.path.dirname(__file__))
    repo = Repo(repo_dir)
    commit = repo.head.commit.hexsha[:7]

    # use jpg or npy
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # load_datasets may add dataset-specific attributes onto args.
    train_dataset, valid_dataset, args = data.load_datasets(parser, args)
    print("TRAIN DATASET", train_dataset)
    print("VALID DATASET", valid_dataset)

    # create output dir if not exist
    target_path = Path(args.output)
    target_path.mkdir(parents=True, exist_ok=True)

    train_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                **dataloader_kwargs)
    valid_sampler = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=1,
                                                **dataloader_kwargs)

    # =========================================================================
    # if args.model:
    #     scaler_mean = None
    #     scaler_std = None
    #
    # else:
    # =========================================================================
    scaler_mean, scaler_std = get_statistics(args, train_dataset)

    max_bin = utils.bandwidth_to_max_bin(train_dataset.sample_rate,
                                         args.nfft, args.bandwidth)

    unmix = model.OpenUnmix(input_mean=scaler_mean,
                            input_scale=scaler_std,
                            nb_channels=args.nb_channels,
                            hidden_size=args.hidden_size,
                            n_fft=args.nfft,
                            n_hop=args.nhop,
                            max_bin=max_bin,
                            sample_rate=train_dataset.sample_rate).to(device)

    optimizer = torch.optim.Adam(unmix.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Reduce LR when validation loss plateaus; cooldown avoids back-to-back
    # reductions.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_decay_gamma,
        patience=args.lr_decay_patience,
        cooldown=10)

    es = utils.EarlyStopping(patience=args.patience)

    # if a model is specified: resume training
    if args.model:
        # disable progress bar
        err = io.StringIO()
        with redirect_stderr(err):
            # Replace the freshly built model with the pretrained umxhq
            # weights from torch.hub (transfer-learning path).
            unmix = torch.hub.load('sigsep/open-unmix-pytorch',
                                   'umxhq',
                                   target=args.target,
                                   device=device,
                                   pretrained=True)
        # =====================================================================
        # model_path = Path(args.model).expanduser()
        # with open(Path(model_path, args.target + '.json'), 'r') as stream:
        #     results = json.load(stream)
        #
        # target_model_path = Path(model_path, args.target + ".chkpnt")
        # checkpoint = torch.load(target_model_path, map_location=device)
        # unmix.load_state_dict(checkpoint['state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer'])
        # scheduler.load_state_dict(checkpoint['scheduler'])
        # # train for another epochs_trained
        # t = tqdm.trange(
        #     results['epochs_trained'],
        #     results['epochs_trained'] + args.epochs + 1,
        #     disable=args.quiet
        # )
        # train_losses = results['train_loss_history']
        # valid_losses = results['valid_loss_history']
        # train_times = results['train_time_history']
        # best_epoch = results['best_epoch']
        # es.best = results['best_loss']
        # es.num_bad_epochs = results['num_bad_epochs']
        # # else start from 0
        # =====================================================================

    # Histories always start from scratch, even when loading umxhq weights
    # (the checkpoint-resume path above is commented out).
    t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
    train_losses = []
    valid_losses = []
    train_times = []
    best_epoch = 0

    for epoch in t:
        t.set_description("Training Epoch")
        end = time.time()
        train_loss = train(args, unmix, device, train_sampler, optimizer)
        valid_loss = valid(args, unmix, device, valid_sampler)
        scheduler.step(valid_loss)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        t.set_postfix(train_loss=train_loss, val_loss=valid_loss)
        # step() returns True when patience is exhausted.
        stop = es.step(valid_loss)

        # Per-epoch loss plots (blocking when a GUI backend is active --
        # NOTE(review): plt.show() inside the loop stalls headless-less runs).
        from matplotlib import pyplot as plt
        plt.figure(figsize=(16, 12))
        plt.subplot(2, 2, 1)
        plt.title("Training loss")
        plt.plot(train_losses, label="Training")
        plt.xlabel("Iterations")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()
        #plt.savefig(Path(target_path, "train_plot.pdf"))

        plt.figure(figsize=(16, 12))
        plt.subplot(2, 2, 2)
        plt.title("Validation loss")
        plt.plot(valid_losses, label="Validation")
        plt.xlabel("Iterations")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()
        #plt.savefig(Path(target_path, "val_plot.pdf"))

        if valid_loss == es.best:
            best_epoch = epoch

        # Checkpoint every epoch; is_best flags the current best model.
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': unmix.state_dict(),
                'best_loss': es.best,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            },
            is_best=valid_loss == es.best,
            path=target_path,
            target=args.target)

        # save params
        params = {
            'epochs_trained': epoch,
            'args': vars(args),
            'best_loss': es.best,
            'best_epoch': best_epoch,
            'train_loss_history': train_losses,
            'valid_loss_history': valid_losses,
            'train_time_history': train_times,
            'num_bad_epochs': es.num_bad_epochs,
            'commit': commit
        }

        with open(Path(target_path, args.target + '.json'), 'w') as outfile:
            outfile.write(json.dumps(params, indent=4, sort_keys=True))

        train_times.append(time.time() - end)

        if stop:
            print("Apply Early Stopping")
            break
def run():
    """Per-core TPU (torch_xla) training loop for the tweet model.

    Runs on one XLA device; DistributedSampler shards the data across
    xm.xrt_world_size() replicas, and metrics are combined via
    xm.mesh_reduce. Saves the model whenever the reduced Jaccard improves,
    with early stopping after 3 non-improving epochs.

    NOTE(review): relies on module-level names `seed`, `MX`, `args`,
    `train_df`, `tokenizer`, `num_train_dpoints`, `f`, `reduce_fn`,
    `train`, `valid` -- defined elsewhere in this file.
    """
    torch.manual_seed(seed)
    device = xm.xla_device()
    model = MX.to(device)

    # DataLoaders
    train_dataset = TweetDataset(args=args,
                                 df=train_df,
                                 mode="train",
                                 fold=args.fold_index,
                                 tokenizer=tokenizer)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               drop_last=False,
                                               num_workers=2)
    valid_dataset = TweetDataset(args=args,
                                 df=train_df,
                                 mode="valid",
                                 fold=args.fold_index,
                                 tokenizer=tokenizer)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              sampler=valid_sampler,
                              num_workers=1,
                              drop_last=False)

    # Exclude bias/LayerNorm parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]

    # Steps per replica: total points / batch size / world size, per epoch.
    num_train_steps = int(num_train_dpoints / args.batch_size /
                          xm.xrt_world_size() * args.epochs)
    # LR is scaled by the number of replicas (linear scaling rule).
    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate * xm.xrt_world_size(),
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=num_train_steps)

    xm.master_print("Training is Starting ...... ")
    best_jac = 0
    best_loss = 9999
    early_stopping = utils.EarlyStopping(patience=3, mode="max")
    for epoch in range(args.epochs):
        # ParallelLoader feeds the per-device shard of each batch.
        para_loader = pl.ParallelLoader(train_loader, [device])
        train_loss = train(args, para_loader.per_device_loader(device), model,
                           device, optimizer, scheduler, epoch, f,
                           args.max_seq_len)
        para_loader = pl.ParallelLoader(valid_loader, [device])
        valid_jac, valid_loss = valid(
            args, para_loader.per_device_loader(device), model, device,
            tokenizer, epoch, f, args.max_seq_len)

        # Combine per-replica metrics into a single value.
        jac = xm.mesh_reduce("jac_reduce", valid_jac, reduce_fn)
        val_loss = xm.mesh_reduce("valid_loss_reduce", valid_loss, reduce_fn)

        xm.master_print(f"**** Epoch {epoch+1} **==>** Jaccard = {jac}")
        xm.master_print(
            f"**** Epoch {epoch+1} **==>** valid_loss = {val_loss}")
        log_ = f"**** Epoch {epoch+1} **==>** Jaccard = {jac}"
        f.write(log_ + "\n\n")

        # Manual best-model checkpointing on the reduced Jaccard score;
        # the EarlyStopping call below only tracks patience ("none" path).
        if jac > best_jac:
            xm.master_print("**** Model Improved !!!! Saving Model")
            xm.save(
                model.state_dict(),
                os.path.join(args.save_path, f"fold_{args.fold_index}"))
            best_jac = jac

        early_stopping(jac, model, "none")
        if early_stopping.early_stop:
            print("Early stopping")
            break
'decomp' : torch.zeros((1,), requires_grad=True, device=device), 'ihm' : torch.zeros((1,), requires_grad=True, device=device), 'los' : torch.zeros((1,), requires_grad=True, device=device), 'pheno' : torch.zeros((1,), requires_grad=True, device=device), 'readmit': torch.zeros((1,), requires_grad=True, device=device), 'ltm': torch.zeros((1,), requires_grad=True, device=device), } #If using uncertianty weighting, use the below optimizer to add the log vars #Leave out readmit task due to poor performance #optimizer = torch.optim.Adam(([p for p in model.parameters()] + [log_var[t] for t in log_var if t != 'readmit']), lr=learning_rate) #for uncertainty weighting #-------------------- define optimizer and other hyperparams ----------------# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=10, gamma=0.3) early_stopper = utils.EarlyStopping(experiment_name = experiment) #------------------------load word embedings---------------------# embedding_layer = nn.Embedding(vectors.shape[0], vectors.shape[1]) embedding_layer.weight.data.copy_(torch.from_numpy(vectors)) embedding_layer.weight.requires_grad = False #-------------------------- Define the train/val/test dataloaders ------------# #As long as datapaths are correct, no changes should be necessary. 
train_mm_dataset = MultiModal_Dataset(train_val_ts_root_dir, train_val_text_root_dir,train_val_tab_root_dir, train_listfile, discretizer, train_val_starttime_path,\ regression, bin_type, None, ihm_pos, los_pos, use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes) val_mm_dataset = MultiModal_Dataset(train_val_ts_root_dir, train_val_text_root_dir, train_val_tab_root_dir, val_listfile, discretizer, train_val_starttime_path,\ regression, bin_type, None, ihm_pos, los_pos, use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes) test_mm_dataset = MultiModal_Dataset(test_ts_root_dir, test_text_root_dir,test_tab_root_dir, test_listfile, discretizer, test_starttime_path,\ regression, bin_type, None, ihm_pos, los_pos, use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes)
def run(fold=0):
    """Train the RoBERTa sentiment model on one k-fold split.

    Args:
        fold: k-fold index held out for validation (default 0).

    FIXES:
    * The `no_decay` parameter group (bias/LayerNorm) was given
      `weight_decay: 0.01` -- the whole point of the `no_decay` list is to
      exempt those parameters from decay, and every other training function
      in this file uses 0.0 for that group. Set it to 0.0.
    * Fall back to CPU when CUDA is unavailable instead of crashing on
      `torch.device('cuda')`, consistent with the sibling entry points.
    """
    # kfold type of data input
    data = pd.read_csv(config.TRAIN_FOLDS_FILE)
    df_train = data[data['kfold'] != fold].reset_index(drop=True)
    df_valid = data[data['kfold'] == fold].reset_index(drop=True)

    train_data = CommentData(comments=df_train['Comment'],
                             labels=df_train['Label_encoded'],
                             sentiments=df_train['Sentiment_encoded'])
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers = 4
    )
    valid_data = CommentData(comments=df_valid['Comment'],
                             labels=df_valid['Label_encoded'],
                             sentiments=df_valid['Sentiment_encoded'])
    valid_dataloader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.VALID_BATCH_SIZE,
        # num_workers = 4
    )

    # Use the GPU when present, otherwise run on CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_config = RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    # Hidden states are needed downstream by SentimentModel.
    model_config.output_hidden_states = True
    model = SentimentModel(model_config, config.OUTPUT_SIZE)
    model.to(device)

    # Exclude bias/LayerNorm parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0  # was 0.01: no_decay params must not be decayed
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Linear decay with no warmup over the full training schedule.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # train_fn(data_loader, model, device, optimizer, scheduler=None)
    train_loss_rec = []
    eval_loss_rec = []
    # Stop after 5 epochs without validation-loss improvement.
    early_stopping = utils.EarlyStopping(patience=5, mode='min')

    for epoch in range(config.EPOCHS):
        print(f'########### fold = {fold} epoch = {epoch} ############')
        loss_train = engine.train_fn(data_loader=train_dataloader,
                                     model=model,
                                     device=device,
                                     optimizer=optimizer,
                                     scheduler=scheduler)
        train_loss_rec.append(loss_train)

        losses_eval = engine.eval_fn(valid_dataloader, model, device)
        eval_loss_rec.append(losses_eval)
        print(f'train_loss = {loss_train} eval_loss = {losses_eval}')

        # print(f'save model_{fold}.bin')
        # torch.save(model.state_dict(), config.OUTPUT_PATH + f'/model_{fold}.bin')
        # early_stopping() also checkpoints the model on improvement.
        early_stopping(losses_eval,
                       model,
                       model_path=config.OUTPUT_PATH +
                       f'/model_label_{fold}.bin')
        if early_stopping.early_stop:
            print('Early stopping')
            break