def fit(self, model, train_loader, val_loader, test_loader):
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=1e-4, weight_decay=1e-5)
    stopper = EarlyStopping(self.model_path, self.tasks, patience=self.patience)
    self._train(model, train_loader, val_loader, self.loss_fn, optimizer, stopper)
    stopper.load_checkpoint(model)
    test_results_dict = self._eval(model, test_loader)
    for metric in self.metrics:
        print(f"test {metric}:{test_results_dict[metric]['mean']}")
    return model, test_results_dict
def trainNet(model, train_loader, val_loader, device, static_map, start_epoch=0, globaliter_=0):
    # Print all of the hyperparameters of the training iteration:
    print("===== HYPERPARAMETERS =====")
    print("batch_size=", config['dataloader']['batch_size'])
    print("epochs=", config['num_epochs'])
    print('starting from epoch %i' % start_epoch)
    print("learning_rate=", config['optimizer']['lr'])
    print("network_depth=", config['model']['depth'])
    print("=" * 30)

    # define the optimizer & learning rate
    optim = torch.optim.SGD(model.parameters(), **config['optimizer'])
    scheduler = StepLR(optim, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

    if config['cont_model_path'] is not None:
        log_dir = config['cont_model_path']
    else:
        log_dir = 'runs/Unet-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") + \
                  '-'.join(config['dataset']['cities'])
    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = globaliter_

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)

    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(start_epoch, config['num_epochs'])):
        writer.write_lr(optim, epoch)

        # train for one epoch
        globaliter = train(model, train_loader, static_map, optim, device, writer, epoch, globaliter)

        # At the end of the epoch, do a pass on the validation set
        val_loss = validate(model, val_loader, static_map, device, writer, globaliter)

        # At the end of the epoch, do a pass on the validation set only considering the test times
        # val_loss_testtimes = validate(model, val_loader_ttimes, device, writer, globaliter, if_testtimes=True)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model, epoch + 1, globaliter)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        scheduler.step(epoch)

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close tensorboard writer
    writer.close()
def train_test(self):
    # load model if it exists, otherwise initialize the weights
    if self.config.load_model is True:
        self.model.load_model()
    else:
        self.model.weight_init()
        print('weights are initialized')

    # optimizer
    self.momentum = 0.9
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=1.0)
    # scheduler = lr_scheduler.StepLR(self.optimizer, step_size=70, gamma=0.01)
    scheduler = lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)

    # loss function
    if self.config.gpu_mode:
        self.model.cuda()
        self.MSE_loss = nn.MSELoss().cuda()
    else:
        self.MSE_loss = nn.MSELoss()

    print('---------- Networks architecture -------------')
    utils.print_network(self.model)
    print('----------------------------------------------')

    # load dataset
    train_data_loader = self.data_train
    test_data_loader = self.data_test

    ################# Train #################
    print('Training is started.')
    avg_loss = []
    avg_loss_test = []
    avg_loss_log_test = []
    step = 0

    es = EarlyStopping(patience=8)

    self.model.train()  # sets training mode; model.eval() sets testing mode
    for epoch in range(self.config.num_epochs):
        scheduler.step()
        epoch_loss = 0
        for iter, (input, target, _) in enumerate(train_data_loader):
            # input data (low resolution image)
            if self.config.gpu_mode:
                x_ = Variable(input.cuda())
                y_ = Variable(target.cuda())
            else:
                x_ = Variable(input)
                y_ = Variable(target)

            # update network
            self.optimizer.zero_grad()
            model_out = self.model(x_)
            loss = torch.sqrt(self.MSE_loss(model_out, y_))
            loss.backward()  # the result is a tensor
            self.optimizer.step()

            # log
            epoch_loss += loss
            print("Epoch: [%2d] [%4d/%4d] loss: %.8f" %
                  ((epoch + 1), (iter + 1), len(train_data_loader), loss))

            # tensorboard logging
            self.logger.scalar_summary('loss', loss, step + 1)
            step += 1

        # avg. loss per epoch
        avg_loss.append((epoch_loss / len(train_data_loader)).detach().cpu().numpy())

        if (epoch + 1) % self.config.save_epochs == 0:
            self.model.save_model(epoch + 1)

        # calculate test loss
        with torch.no_grad():
            loss_test, loss_log_test = self.test(test_data_loader)
            epoch_loss_test = loss_test / len(test_data_loader)
            epoch_loss_log_test = loss_log_test / len(test_data_loader)
            avg_loss_test.append(float(epoch_loss_test))
            avg_loss_log_test.append(float(epoch_loss_log_test))

        # if es.step(float(epoch_loss_test)):
        #     self.model.save_model(epoch=None)
        #     print('Early stop at %2d epoch' % (epoch + 1))
        #     break

    # Plot avg. loss
    utils.plot_loss(self.config, [avg_loss, avg_loss_log_test])
    utils.plot_loss(self.config, [avg_loss_test], origin=True)
    print('avg_loss: ', avg_loss[-1])
    print('avg_loss_log with original data: ', avg_loss_test[-1])
    print('avg_loss_log with log data: ', avg_loss_log_test[-1])
    print("Training and test is finished.")

    # Save final trained parameters of model
    self.model.save_model(epoch=None)
writer = Visualizer(log_dir)
with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
    json.dump(config, fp)

# define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model
model = diff_pool_net2(dataset, **config['model']).to(device)
data = data.to(device)

lr = config['optimizer']['lr']
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# initialize the early_stopping object
early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=False)

best_val_acc = test_acc = 0
for epoch in range(1, config['epochs']):
    output_dict = train()
    accs, s = test()
    train_acc, val_acc, tmp_test_acc = accs
    writer.write_lr(optimizer, epoch)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    writer.write_acc_train(train_acc, epoch)
def bcn(config, data_file, embeddings, device, dataset, embeddings_type):
    # extensions: add 2 languages, use a combination of CoVe embeddings (like ELMo)
    name = "test_model"
    torch.manual_seed(123)

    inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
    labels = data.Field(sequential=False, unk_token=None)

    print('Generating train, dev, test splits')
    if dataset == 'IWSLT':
        # using the IWSLT 2016 TED talk translation task
        train, dev, test = datasets.IWSLT.splits(root=data_file,
                                                 exts=['.en', '.de'],
                                                 fields=[inputs, inputs])
    elif dataset == 'SST-2':
        train, dev, test = datasets.SST.splits(
            text_field=inputs, label_field=labels, root=data_file,
            fine_grained=False, train_subtrees=True,
            filter_pred=lambda ex: ex.label != 'neutral')
    elif dataset == 'SST-5':
        train, dev, test = datasets.SST.splits(text_field=inputs,
                                               label_field=labels,
                                               root=data_file,
                                               fine_grained=True,
                                               train_subtrees=True)
    elif dataset == 'IMDB':
        train, test = datasets.IMDB.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file)
        train, dev = train.split(split_ratio=0.9,
                                 stratified=True)  # 0.9 in order to be close to the paper
    elif dataset == 'TREC-6':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=False)
        train, dev = train.split(split_ratio=0.9, stratified=True)
    elif dataset == 'TREC-50':
        train, test = datasets.TREC.splits(text_field=inputs,
                                           label_field=labels,
                                           root=data_file,
                                           fine_grained=True)
        train, dev = train.split()
    elif dataset == 'SNLI':
        train, dev, test = datasets.SNLI.splits(text_field=inputs,
                                                label_field=labels,
                                                root=data_file)
    else:
        print('Invalid dataset name detected...')
        return

    print('Building vocabulary')
    inputs.build_vocab(train, dev, test)
    inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=embeddings))
    labels.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_size=config["train_batch_size"],
        device=torch.device(device) if device >= 0 else None,
        sort_within_batch=True)

    model = BCN(config=config,
                n_vocab=len(inputs.vocab),
                vocabulary=inputs.vocab.vectors,
                embeddings=embeddings,
                num_labels=len(labels.vocab.freqs),
                embeddings_type=embeddings_type)

    bcn_params = [p for n, p in model.named_parameters()
                  if "mtlstm" not in n and p.requires_grad]

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(bcn_params, lr=0.001)

    if device != -1:
        model.to(device)
    print(model)

    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in bcn_params if p.requires_grad)
    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))

    #####################################
    # Training Pipeline
    #####################################
    trainer = BCNTrainer(model=model,
                         train_loader=train_iter,
                         valid_loader=dev_iter,
                         criterion=criterion,
                         device="cpu" if device == -1 else 'cuda',
                         config=config,
                         optimizers=[optimizer])

    print('Generating CoVe')

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    exp = Experiment(name, config, src_dirs=None, output_dir=EXP_DIR)
    exp.add_metric("ep_loss", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(1, config["epochs"] + 1):
        train_loss = trainer.train_epoch()
        print(model.w, model.gama)
        val_loss, y, y_pred = trainer.eval_epoch()

        # Calculate accuracy and f1-macro on the evaluation set
        exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
        exp.update_metric("ep_loss", val_loss.item(), "VAL")
        exp.update_metric("ep_f1", 0, "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        exp.update_metric("ep_acc", 0, "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(["ep_loss", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=name)

        if early_stopping.stop(val_loss):
            print("Early Stopping (according to cls loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
def train(config, model, train_loader, val_loader, optimizer):
    if not os.path.exists('./runs'):
        os.mkdir('./runs')
    if not os.path.exists('./checkpoints'):
        os.mkdir('./checkpoints')

    writer = SummaryWriter('./runs/{}'.format(config.exp_name))
    early_stopping = EarlyStopping(save_dir=config.save_dir,
                                   model_type=config.exp_name,
                                   patience=config.patience,
                                   verbose=True)

    for epoch in tqdm(range(1, config.n_epochs + 1)):
        highlight_loss = []

        # Training
        model.train()
        for batch_idx, data in enumerate(train_loader):
            # zero the grads
            optimizer.zero_grad()

            # handle the case of history vs. non-history training
            if len(data) == 4:  # is_history = False
                vid_feat_tensor, gt_strided_binary, user_path, nframes = data

                # convert data to cuda
                vid_feat_tensor = vid_feat_tensor.unsqueeze(dim=2).transpose(1, 3).cuda()
                gt_strided_binary = gt_strided_binary.view(1, 1, -1).cuda()

                # forward to model
                output = model(vid_feat_tensor)
            else:  # is_history = True, i.e. len(data) == 5
                vid_feat_tensor, gt_strided_binary, usr_hist_list, usr_path, nframes = data

                # skip samples whose usr_hist_list has no data
                if len(usr_hist_list) == 0:
                    continue

                # convert data to cuda
                vid_feat_tensor = vid_feat_tensor.unsqueeze(dim=2).transpose(1, 3).cuda()
                gt_strided_binary = gt_strided_binary.view(1, 1, -1).cuda()
                usr_hist_list = [hist.float().cuda() for hist in usr_hist_list]

                # forward to the model with history
                output = model(vid_feat_tensor, usr_hist_list)

            # compute loss
            loss = cross_entropy2d(output, gt_strided_binary)

            # backward and update the model
            loss.backward()
            optimizer.step()

            highlight_loss.append(loss.item())
            if batch_idx % config.print_interval == 0:
                print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                    epoch, batch_idx + 1, len(train_loader), loss.item()))

        mean_highlight_loss = np.average(highlight_loss)
        writer.add_scalar('Train/loss', mean_highlight_loss, epoch)

        # Validation
        if config.is_validate and epoch % config.validate_interval == 0:
            avg_map, avg_val_loss = validate(config, model, val_loader)

            # val avg_map for early stopping
            early_stopping(avg_map, model, epoch)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            writer.add_scalar('Val/mAP', avg_map, epoch)
            writer.add_scalar('Val/Loss', avg_val_loss, epoch)

    # close summary writer
    writer.close()
    return
from torchsummary import summary
from models.resnet import *
from models.resnext import *
from models.densenet import *
from utils.arg_utils import *
from utils.data_utils import *
from utils.progress_utils import progress_bar
from utils.earlystopping import EarlyStopping

"""
arguments
"""
args = fetch_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
early_stopping = EarlyStopping(args['patience'], verbose=True, delta=args['delta'])

"""
loading data-set....
"""
print("==> loading data-set...")
train_loader, classes = gen_train_loader(args['train_path'], args['input_size'],
                                         args['train_batch_size'])
test_loader, _ = gen_test_loader(args['test_path'], args['input_size'],
                                 args['test_batch_size'])
print('Task classes are: ', classes)
num_classes = len(classes)
print(num_classes)

"""
model
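# None of the snippets above ships the EarlyStopping implementation itself, and their
# constructors differ (log_dir-first, positional patience, mode string, etc.). Below is a
# minimal sketch of a validation-loss stopper with the patience/delta/verbose interface and
# the `early_stopping(val_loss, model)` / `.early_stop` usage seen in several loops here.
# The body, the `path` argument, and the checkpointing behaviour are assumptions for
# illustration only, not the actual utils.earlystopping.EarlyStopping.
import numpy as np
import torch


class EarlyStopping:
    """Stop training when the validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=7, verbose=False, delta=0.0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path          # hypothetical checkpoint location
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # improvement: reset the counter and checkpoint the current model
            if self.verbose:
                print(f'Validation loss improved ({self.best_loss:.6f} -> {val_loss:.6f}); saving model.')
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
        else:
            # no improvement: spend one unit of the patience budget
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True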
class ModelWrapper(DefaultSetting):
    def __init__(
        self,
        model,
        loss_func=None,
        optimizer=None,
        device=None,
        multi_gpus=True,
        log=100,
    ):
        super().__init__(device, loss_func)
        self.model = model
        if optimizer is None:
            self.optimizer = self.default_optimizer(model)
        else:
            self.optimizer = optimizer
        self.multi_gpus = multi_gpus
        self.log = log
        self.checkpoint = None
        self.early_stopping = None

    # TODO: this function (__call__) has not been checked yet
    # update model setting
    def __call__(
        self,
        model=None,
        loss_func=None,
        optimizer=None,
        device=None,
        multi_gpus=None,
        log=None,
    ):
        if model is not None:
            self.model = model
        if optimizer is None:
            self.optimizer = self.default_optimizer(self.model)
        else:
            self.optimizer = optimizer
        if loss_func is not None:
            self.loss_func = loss_func
        if device is not None:
            self.device = device
        if multi_gpus is not None:
            self.multi_gpus = multi_gpus
        if self.log is not None:
            self.log = log
        self.checkpoint = None
        self.early_stopping = None

    # train model
    def train(
        self, train_loader, val_loader=None, max_epochs=1000, enable_early_stopping=True
    ):
        if val_loader is None:
            enable_early_stopping = False

        print()
        print("-" * 2, "Training Setup", "-" * 2)
        print(f"Maximum Epochs: {max_epochs}")
        print(f"Enable Early Stopping: {enable_early_stopping}")
        print("-" * 20)
        print("*Start Training.")

        # model setup
        self.model.train().to(self.device)
        if self.multi_gpus and torch.cuda.device_count() > 1:
            print(f"*Using {torch.cuda.device_count()} GPUs!")
            self.model = nn.DataParallel(self.model)

        # early stopping instance
        if enable_early_stopping:
            if self.early_stopping is None:
                self.early_stopping = EarlyStopping(patience=5)
            else:
                self.early_stopping.reset_counter()

        # training start!
        for epoch in range(1, max_epochs + 1):
            running_loss = 0.0
            for step, data in enumerate(train_loader, start=1):
                inputs, labels = data
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                # Zero the parameter gradients
                self.optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.model(inputs)
                loss = self.loss_func(outputs, labels)
                loss.backward()
                self.optimizer.step()

                # print statistics
                running_loss += loss.item()
                if step % 100 == 0 or step == len(train_loader):
                    print(
                        f"[{epoch}/{max_epochs}, {step}/{len(train_loader)}] loss: {running_loss / step :.3f}"
                    )

            # train & validation loss
            train_loss = running_loss / len(train_loader)
            if val_loader is None:
                print(f"train loss: {train_loss:.3f}")
            else:
                # FIXME: fix the problem that the first validation is not correct
                val_loss = self.validation(val_loader)
                print(f"train loss: {train_loss:.3f}, val loss: {val_loss:.3f}")

            if enable_early_stopping:
                self.early_stopping(self.model, val_loss, self.optimizer)
                if self.early_stopping.get_early_stop():
                    print("*Early Stopping.")
                    break

        print("*Finished Training!")
        if enable_early_stopping:
            checkpoint = self.early_stopping.get_checkpoint()
        else:
            checkpoint = Checkpoint()
            checkpoint.tmp_save(self.model, self.optimizer, epoch, val_loss)
        self.checkpoint = checkpoint
        self.model = checkpoint.load(self.model, self.optimizer)["model"]
        return self.model

    # %% validation
    @torch.no_grad()
    def validation(self, val_loader):
        self.model.eval().to(self.device)
        running_loss = 0.0
        for data in val_loader:
            inputs, labels = data
            inputs, labels = inputs.to(self.device), labels.to(self.device)
            outputs = self.model(inputs)
            loss = self.loss_func(outputs, labels)
            running_loss += loss.item()
        return running_loss / len(val_loader)

    # classification report of the model on test data
    @torch.no_grad()
    def classification_report(
        self, test_loader, target_names=None, binary=False, visualize=False
    ):
        print("-" * 10, "Classification Report", "-" * 10)
        print(f"loss: {self.validation(test_loader)}")

        model = self.model
        model.eval().to(self.device)

        y_pred, y_true = [], []
        for data in test_loader:
            inputs, labels = data
            inputs, labels = inputs.to(self.device), labels.to(self.device).long()
            outputs = model(inputs)
            if not binary:
                _, predicted = torch.max(outputs, 1)
            else:
                predicted = torch.round(outputs)
            y_true += labels.squeeze().cpu().tolist()
            y_pred += predicted.squeeze().cpu().tolist()

        if visualize:
            vis = Visualization(y_true, y_pred, target_names)
            vis.confusion_matrix()
            vis.classification_report()
            vis.show()

        report = classification_report(y_true, y_pred, target_names=target_names)
        print(report)
        return report
def trainNet(model, train_loader, val_loader, device, adj, nn_ixs, edge_index, config, log_dir, coords=None):
    """
    Args:
        model:
        train_loader:
        val_loader:
        device:
        adj:
        nn_ixs:
        edge_index:
        config:
        log_dir:
        coords:

    Returns:

    """
    # define the optimizer & learning rate
    optim = torch.optim.Adam(model.parameters(), **config['optimizer'])
    # scheduler = StepLR(optim, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = 0

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)

    # adj = adj.to(device)
    batch_size = config['dataloader']['batch_size']
    print_every_step = config['print_every_step']

    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(config['num_epochs'])):
        writer.write_lr(optim, globaliter)

        # train for one epoch
        globaliter = train(model=model, train_loader=train_loader, optim=optim, device=device,
                           writer=writer, epoch=epoch, globaliter=globaliter, adj=adj,
                           nn_ixs=nn_ixs, edge_index=edge_index, batch_size=batch_size,
                           coords=coords, print_every_step=print_every_step)

        # At the end of the epoch, do a pass on the validation set
        # val_loss = validate(model, val_loader, device, writer, globaliter, adj, nn_ixs, edge_index)
        val_loss = validate(model=model, val_loader=val_loader, device=device, adj=adj,
                            nn_ixs=nn_ixs, edge_index=edge_index, batch_size=batch_size,
                            coords=coords, writer=writer, globaliter=globaliter)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        # scheduler.step()

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close writer
    writer.close()
def train(self, optimizer, criterion, batch_size=1, epochs=1, kfold=2, iteration=1,
          shuffle=True, random_state=None, filepath=None, patience=7):
    best_state = []
    best_accuracy = 0.
    _kfold = KFold(n_splits=kfold, shuffle=shuffle, random_state=random_state)
    _data = self.dataset.data.numpy() if isinstance(self.dataset.data, torch.Tensor) else self.dataset.data
    _label = self.dataset.label
    minimum_early_stopping_epochs = 10
    result = np.zeros((iteration, kfold), dtype=float)

    for iter_index in range(iteration):
        for fold_index, (train_idx, test_idx) in enumerate(_kfold.split(_data)):
            print("=" * 12)
            print("Iter {} Fold {}".format(iter_index, fold_index))
            print("=" * 12)

            _model = self.model
            _model.load_state_dict(self.reset_state)

            x_train_fold = torch.from_numpy(_data[train_idx]).float()
            x_test_fold = torch.from_numpy(_data[test_idx]).float()
            y_train_fold = torch.from_numpy(_label[train_idx])
            y_test_fold = torch.from_numpy(_label[test_idx])

            train_data = TensorDataset(x_train_fold, y_train_fold)
            test_data = TensorDataset(x_test_fold, y_test_fold)
            train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

            early_stopping = EarlyStopping(patience=patience)

            for epoch in range(epochs):
                _model.train()
                for index, (data, label) in enumerate(train_loader):
                    data, label = data.to(self.DEVICE), label.to(self.DEVICE)
                    optimizer.zero_grad()
                    output = _model(data)
                    loss = criterion(output, label)
                    loss.backward()
                    optimizer.step()
                    print("Epoch{} Training {:5.2f}% | Loss: {:.4f}".format(
                        epoch,
                        (index + 1) * batch_size / len(train_loader.dataset) * 100.,
                        loss.item()), end='\r')
                    # print(_model.output_layer.weight.grad)

                _model.eval()
                test_loss = 0.
                correct = 0
                with torch.no_grad():
                    for index, (data, label) in enumerate(test_loader):
                        data, label = data.to(self.DEVICE), label.to(self.DEVICE)
                        output = _model(data)
                        loss = criterion(output, label)
                        test_loss += loss.item()  # Loss history?
                        pred = output.data.max(1, keepdim=True)[1]
                        correct += pred.eq(label.data.view_as(pred)).cpu().sum()
                        print("Testing... {:5.2f}%".format(
                            (index + 1) * batch_size / len(test_loader.dataset)), end='\r')

                test_loss /= len(test_loader.dataset)
                accuracy = correct / float(len(test_loader.dataset))
                result[iter_index, fold_index] = accuracy
                print("Epoch{} Test Result: loss {:.4f} | accuracy {:.5f}({}/{})".format(
                    epoch, test_loss, accuracy, correct, len(test_loader.dataset)))

                if filepath is not None:
                    if not os.path.isdir(filepath):
                        os.mkdir(filepath)
                    torch.save(
                        _model.state_dict(),
                        os.path.join(
                            filepath,
                            f"model{iter_index}_{fold_index}_" +
                            datetime.datetime.now().strftime("%m%d_%H:%M:%S")))

                if epoch >= minimum_early_stopping_epochs:
                    early_stopping(test_loss)
                    if early_stopping.early_stop:
                        print("Early stopping")
                        break

        iter_accuracy = result[iter_index].mean()
        if iter_accuracy > best_accuracy:
            best_state = _model.state_dict()
            best_accuracy = iter_accuracy

        print('=' * 12)
        print("Iteration {} complete with {:5.2f}% average accuracy".format(
            iter_index, iter_accuracy * 100.))
        print('=' * 12)

    print("Training complete with {:5.2f}%".format(result.mean()))
    self.model.load_state_dict(best_state)
    return result
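# The k-fold trainer above returns the full iteration x fold accuracy matrix. A minimal
# usage sketch follows; the `trainer` instance, the optimizer/criterion objects, and the
# hyperparameter values are assumptions for illustration, only the shape of `result`
# comes from the method itself.
import numpy as np

result = trainer.train(optimizer, criterion, batch_size=32, epochs=50,
                       kfold=5, iteration=3, patience=7)

# result has shape (iteration, kfold): summarize accuracy per repetition and overall
per_iter_mean = result.mean(axis=1)   # average accuracy of each repetition
overall_mean = result.mean()          # grand mean over all folds and repetitions
overall_std = result.std()            # dispersion across folds
print(f"per-iteration accuracy: {per_iter_mean}")
print(f"overall: {overall_mean:.4f} +/- {overall_std:.4f}")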
train_dataset = CTScanDataset(train_path, transform=composed)
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = CTScanDataset(val_path, transform=composed)
valloader = DataLoader(val_dataset, batch_size=val_batch_size)

dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = Register3d(trainloader[0].size, device=dev, linear=affine_transform)
model.to(dev)

loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=50, verbose=True)
early_stop = EarlyStopping(patience=100, verbose=True)

for epoch in range(num_epochs):
    train_bar = tqdm(trainloader)
    for source, target in train_bar:
        source, target = source.to(dev), target.to(dev)
        optimizer.zero_grad()
        model.train()

        if affine_transform:
            output, deform_grads, theta = model(source, target)
        else:
            output, deform_grads = model(source, target)

        loss = loss_func(output, target)
        if affine_transform:
            loss += alpha * torch.sum(torch.abs(theta - torch.eye(3, 4)))
def run(self):
    while self.processed_steps < self.num_of_steps:
        self.esti_variance_every_steps = _data_segment(self.corpus)
        print('*' * 80)

        need_reload = False
        self.corpus.invoke_train_batches_making()
        variance_earlystopper = EarlyStopping(
            tolerance=self.tolerance,
            scorers=[VarianceScorer()]
        )

        if len(self.replicas) < self.num_of_devices:
            self.replicas.clear()
            self.parallel_model()

        while self.processed_steps < self.num_of_steps and not need_reload:
            next_batches = self.corpus.get_train_batches(self.buffer_every_steps)
            for batch in next_batches:
                time_start = time.time()
                self.train_step(batch)  # worker training
                self.update()           # worker collection and sync
                self.time_sum += time.time() - time_start

                # --- estimate variance begin ---
                if self.processed_steps % self.esti_variance_every_steps == 0:
                    with torch.no_grad():
                        print('*' * 80)
                        print('Variance Estimating...')
                        torch.cuda.empty_cache()
                        variance = self.esti_variance_step()
                        torch.cuda.empty_cache()

                        # Run the variance convergence check (uses a patience mechanism)
                        variance_earlystopper(variance, self.processed_steps)

                        # If the patience has reached its limit, upgrade the model competence level
                        if variance_earlystopper.has_stopped():
                            self.upgrade_competence()
                            need_reload = True
                            break

                        print('Training')
                        print(self.annotate)
                # --- estimate variance end ---

                if self.processed_steps % self.report_every_steps == 0:
                    self.report()

                if self.processed_steps % self.eval_every_steps == 0:
                    with torch.no_grad():
                        print('*' * 80)
                        print('Evaluating')
                        torch.cuda.empty_cache()
                        for model in self.replicas:
                            model.eval()
                        self.eval_step()
                        self.save()
                        torch.cuda.empty_cache()
                        for model in self.replicas:
                            model.train()
                        print('Training')
                        print(self.annotate)

                if self.processed_steps >= self.num_of_steps:
                    print('End of train.')
                    return
    return
def train_test(self):
    # load model if it exists, otherwise initialize the weights
    if self.config.load_model is True:
        self.load_model()
        # self.load_spec_model()
    else:
        self.weight_init()

    # loss function
    if self.config.gpu_mode:
        self.model.cuda()
        self.MSE_loss = nn.MSELoss().cuda()  # by default this averages over every sample
    else:
        self.MSE_loss = nn.MSELoss()

    # optimizer
    self.momentum = 0.9
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=1.0)
    scheduler = lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.1)
    # scheduler = lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)

    print('---------- Networks architecture -------------')
    utils.print_network(self.model)
    print('----------------------------------------------')

    # load dataset
    train_data_loader = self.data_train
    test_data_loader = self.data_test

    ################# Train #################
    print('Training is started.')
    avg_loss = []
    avg_loss_test = []
    avg_loss_log_test = []
    # step = 0

    es = EarlyStopping(patience=50)

    self.model.train()  # sets training mode; model.eval() sets testing mode
    for epoch in range(self.config.num_epochs):
        scheduler.step()
        epoch_loss = 0
        for iter, (input, target, groundtruth) in enumerate(train_data_loader):
            # input data (low resolution image)
            if self.config.gpu_mode:
                x_ = Variable(input.cuda())
                y_ = Variable(groundtruth.cuda())
            else:
                x_ = Variable(input)
                y_ = Variable(groundtruth)
            # if scale is 10, x_.shape is (batchsize, 1, 300)
            # if scale is 100, x_.shape is (batchsize, 1, 30)

            # update network
            self.optimizer.zero_grad()
            model_out = self.model(x_)
            loss = torch.sqrt(self.MSE_loss(model_out, y_))
            loss.backward()  # the result is a tensor
            self.optimizer.step()

            epoch_loss += loss
            # note: len(train_data_loader) is #train samples / batchsize,
            # i.e. the number of batches to iterate over per epoch
            print("Epoch: [%2d] [%4d/%4d] loss: %.8f" %
                  ((epoch + 1), (iter + 1), len(train_data_loader), loss))

            # tensorboard logging
            # self.logger.scalar_summary('loss', loss, step + 1)
            # step += 1

        # avg. loss per epoch
        # dividing by len(train_data_loader) gives the average loss per sample
        avg_loss.append((epoch_loss / len(train_data_loader)).detach().cpu().numpy())

        if (epoch + 1) % self.config.save_epochs == 0:
            self.save_model(epoch + 1)

        # calculate test loss
        with torch.no_grad():
            loss_test, _ = self.test(test_data_loader)
            epoch_loss_test = loss_test / len(test_data_loader)
            avg_loss_test.append(float(epoch_loss_test))

        # nni.report_intermediate_result(
        #     {"default": float(epoch_loss_test), "epoch_loss": float(avg_loss[-1])})

        # if es.step(avg_loss[-1]):
        #     self.save_model(epoch=None)
        #     print('Early stop at %2d epoch' % (epoch + 1))
        #     break

        if epoch % 10 == 0 and epoch != 0:
            utils.plot_loss(self.config, [avg_loss, avg_loss_test])

    # nni.report_final_result({"default": float(avg_loss_test[-1]), "epoch_loss": float(avg_loss[-1])})

    # Plot avg. loss
    utils.plot_loss(self.config, [avg_loss, avg_loss_test])

    with torch.no_grad():
        _, dtw_test = self.test(test_data_loader, True)
        avg_dtw_test = dtw_test / len(test_data_loader)

    print('avg_loss: ', avg_loss[-1])
    print('avg_loss_log with original data: ', avg_loss_test[-1])
    print('dtw with original data: ', avg_dtw_test)
    print("Training and test is finished.")

    # Save final trained parameters of model
    self.save_model(epoch=None)
## Training the model --------------------------------------------------------------------------------
n_epochs = 150
patience = 5  # used for early stopping

# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
optimizer = RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                  weight_decay=1e-5, degenerated_to_sgd=True)  # Rectified Adam
# optimizer = Lookahead(base_optimizer, 1e-3, k=6)

train_losses = []
val_losses = []
early_stopping = EarlyStopping(patience=patience, verbose=True, delta=0.005, diff=0.05)
valid_loss_min = np.Inf

epoch_tqdm = tqdm(total=n_epochs, desc='epochs')
for epoch in range(n_epochs):
    train_tqdm = tqdm(total=len(train_loader), desc='training batch')

    ###################
    # train the model #
    ###################
    model.train()
    for batch_idx, (image, boxes, label) in enumerate(train_loader):
        if train_on_gpu:
            image = image.cuda()
            model = model.cuda()
        optimizer.zero_grad()
        output = model.forward(image)