train_loader = DataLoader(train_dataset, cfg['batch_size'], shuffle=True, num_workers=1) if not cfg['use_both']: val_loader = DataLoader(val_dataset, cfg['batch_size'], shuffle=True, num_workers=1) print('Creating Model...') net = models[cfg['model_name']](cfg).to(device) net = nn.DataParallel(net) n_params = count_parameters(net) print("model: {:,} M parameters".format(n_params / 1024 / 1024)) criterion = bce_with_logits#nn.CrossEntropyLoss() optimizer = optim.Adamax(net.parameters(), lr=cfg['init_lr']) sched = LambdaLR(optimizer, lr_lambda=lr_schedule_func_builder()) checkpoint_path = 'checkpoint/{}'.format(cfg_name) if os.path.exists(checkpoint_path) is False: os.mkdir(checkpoint_path) if cfg['train_check_point'] is not None: net_checkpoint = torch.load(cfg['train_check_point']) net.load_state_dict(net_checkpoint) optim_checkpoint = torch.load('optim_{}'.format(cfg['train_check_point'])) optimizer.load_state_dict(optim_checkpoint) logger = Logger(os.path.join(checkpoint_path, "log.txt")) for k, v in cfg.items(): logger.write(k+': {}'.format(v))
if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, eps=1e-9, weight_decay=args.l2_norm, betas=[0.9, 0.98]) elif args.optim == 'sparseadam': optimizer = optim.SparseAdam(model.parameters(), lr=args.lr, eps=1e-9, weight_decay=args.l2_norm, betas=[0.9, 0.98]) elif args.optim == 'adamax': optimizer = optim.Adamax(model.parameters(), lr=args.lr, eps=1e-9, weight_decay=args.l2_norm, betas=[0.9, 0.98]) elif args.optim == 'rmsprop': optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=1e-10, weight_decay=args.l2_norm, momentum=0.9) elif args.optim == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.l2_norm, momentum=0.9) # 0.01 elif args.optim == 'adagrad': optimizer = optim.Adagrad(model.parameters(),
def __init__(self, opt, embedding=None, state_dict=None): self.opt = opt self.updates = state_dict[ 'updates'] if state_dict and 'updates' in state_dict else 0 self.eval_embed_transfer = True self.train_loss = AverageMeter() self.network = DNetwork(opt, embedding) if state_dict: new_state = set(self.network.state_dict().keys()) for k in list(state_dict['network'].keys()): if k not in new_state: del state_dict['network'][k] for k, v in list(self.network.state_dict().items()): if k not in state_dict['network']: state_dict['network'][k] = v self.network.load_state_dict(state_dict['network']) parameters = [p for p in self.network.parameters() if p.requires_grad] if opt['optimizer'] == 'sgd': self.optimizer = optim.SGD(parameters, opt['learning_rate'], momentum=opt['momentum'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adamax': self.optimizer = optim.Adamax(parameters, opt['learning_rate'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adam': self.optimizer = optim.Adam(parameters, opt['learning_rate'], weight_decay=opt['weight_decay']) elif opt['optimizer'] == 'adadelta': self.optimizer = optim.Adadelta(parameters, opt['learning_rate'], rho=0.95) else: raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer']) if state_dict and 'optimizer' in state_dict: self.optimizer.load_state_dict(state_dict['optimizer']) if opt['fix_embeddings']: wvec_size = 0 else: wvec_size = (opt['vocab_size'] - opt['tune_partial']) * opt['embedding_dim'] if opt.get('have_lr_scheduler', False): if opt.get('scheduler_type', 'rop') == 'rop': self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=opt['lr_gamma'], patience=3) elif opt.get('scheduler_type', 'rop') == 'exp': self.scheduler = ExponentioalLR(self.optimizer, gamma=opt.get('lr_gamma', 0.5)) else: milestones = [ int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',') ] self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=opt.get('lr_gamma')) else: self.scheduler = None self.total_param = sum([p.nelement() for p in parameters]) - wvec_size
Nbatches = int(math.ceil(Ntrain / batch_size)) #batch_size is defined above Nepochs = 500 Nrep = 1 #model = conv3DNet(grid_size, Noutputs, batch_size) #model = conv3DNet(grid_size, Noutputs, batch_size) #model = conv3DNet(grid_size, Noutputs, batch_size) #model = conv3DNet(grid_size, Noutputs, batch_size) #model = UnetGenerator_3d(in_dim=1, out_dim=Noutputs, num_filter=4) #model = UnetGenerator_3d_softmax(in_dim=1, out_dim=Noutputs, num_filter=8) model = UnetGenerator_3d_log_softmax(in_dim=1, out_dim=Noutputs, num_filter=4) #optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.90) #optimizer = optim.Adam(model.parameters()) #optimizer = optim.Adagrad(model.parameters()) optimizer = optim.Adamax(model.parameters()) #optimizer = optim.ASGD(model.parameters()) #optimizer = optim.RMSprop(model.parameters()) #optimizer = optim.Rprop(model.parameters()) scheduler = optim.lr_scheduler.ReduceLROnPlateau( optimizer, 'min', patience=10, verbose=True ) #Reduces the learning rate if it did not decreased by more than 10^-4 in 10 steps train_errors = torch.Tensor(Nepochs).zero_() validation_errors = torch.Tensor(Nepochs).zero_() ep_loss = torch.Tensor(Nepochs).zero_() for i_ep in range(Nepochs): for b_start in range(0, Ntrain, batch_size):
def _opti(self, parameters): return optim.Adamax(parameters)
torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) corpus = Corpus(args.task) model = eval(args.model)(corpus, args) model.train() criterion = nn.NLLLoss() parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adamax(parameters, lr=args.lr) if args.cuda: model.cuda() criterion.cuda() start_time = time.time() total_loss = 0 interval = args.interval save_interval = len(corpus.data_all['train']) // args.batch_size best_dev_score = -99999 iterations = args.epochs*len(corpus.data_all['train']) // args.batch_size print('max iterations: '+str(iterations)) count = 0
def run(args, kwargs): print('\nMODEL SETTINGS: \n', args, '\n') print("Random Seed: ", args.manual_seed) # ================================================================================================================== # SNAPSHOTS # ================================================================================================================== args.model_signature = str(datetime.datetime.now())[0:19].replace(' ', '_') args.model_signature = args.model_signature.replace(':', '_') snapshots_path = os.path.join(args.out_dir, 'vae_' + args.dataset + '_') snap_dir = snapshots_path + args.flow + '_' + str(args.gpu_num) if args.flow != 'no_flow': snap_dir += '_' + 'num_flows_' + str(args.num_flows) if args.flow == 'orthogonal': snap_dir = snap_dir + '_num_vectors_' + str(args.num_ortho_vecs) elif args.flow == 'orthogonalH': snap_dir = snap_dir + '_num_householder_' + str(args.num_householder) elif args.flow == 'iaf': snap_dir = snap_dir + '_madehsize_' + str(args.made_h_size) elif args.flow == 'permutation': snap_dir = snap_dir + '_' + 'kernelsize_' + str(args.kernel_size) elif args.flow == 'mixed': snap_dir = snap_dir + '_' + 'num_householder_' + str(args.num_householder) snap_dir = snap_dir + '__' + args.model_signature + '/' args.snap_dir = snap_dir if not os.path.exists(snap_dir): os.makedirs(snap_dir) # SAVING torch.save(args, snap_dir + args.flow + '.config') # ================================================================================================================== # LOAD DATA # ================================================================================================================== train_loader, val_loader, test_loader, args = load_dataset(args, **kwargs) # ================================================================================================================== # SELECT MODEL # ================================================================================================================== # flow parameters and architecture choice are passed on to model through args if args.flow == 'no_flow': model = VAE.VAE(args) elif args.flow == 'planar': model = VAE.PlanarVAE(args) elif args.flow == 'iaf': model = VAE.IAFVAE(args) elif args.flow == 'orthogonal': model = VAE.OrthogonalSylvesterVAE(args) elif args.flow == 'householder': model = VAE.HouseholderSylvesterVAE(args) elif args.flow == 'triangular': model = VAE.TriangularSylvesterVAE(args) else: raise ValueError('Invalid flow choice') if args.cuda: print("Model on GPU") model.cuda() print(model) optimizer = optim.Adamax(model.parameters(), lr=args.learning_rate, eps=1.e-7) # ================================================================================================================== # TRAINING # ================================================================================================================== train_loss = [] val_loss = [] # for early stopping best_loss = np.inf best_bpd = np.inf e = 0 epoch = 0 train_times = [] for epoch in range(1, args.epochs + 1): t_start = time.time() tr_loss = train(epoch, train_loader, model, optimizer, args) train_loss.append(tr_loss) train_times.append(time.time()-t_start) print('One training epoch took %.2f seconds' % (time.time()-t_start)) v_loss, v_bpd = evaluate(val_loader, model, args, epoch=epoch) val_loss.append(v_loss) # early-stopping if v_loss < best_loss: e = 0 best_loss = v_loss if args.input_type != 'binary': best_bpd = v_bpd print('->model saved<-') torch.save(model, snap_dir + args.flow + '.model') # torch.save(model, snap_dir + args.flow + '_' + 
args.architecture + '.model') elif (args.early_stopping_epochs > 0) and (epoch >= args.warmup): e += 1 if e > args.early_stopping_epochs: break if args.input_type == 'binary': print('--> Early stopping: {}/{} (BEST: loss {:.4f})\n'.format(e, args.early_stopping_epochs, best_loss)) else: print('--> Early stopping: {}/{} (BEST: loss {:.4f}, bpd {:.4f})\n'.format(e, args.early_stopping_epochs, best_loss, best_bpd)) if math.isnan(v_loss): raise ValueError('NaN encountered!') train_loss = np.hstack(train_loss) val_loss = np.array(val_loss) plot_training_curve(train_loss, val_loss, fname=snap_dir + '/training_curve_%s.pdf' % args.flow) # training time per epoch train_times = np.array(train_times) mean_train_time = np.mean(train_times) std_train_time = np.std(train_times, ddof=1) print('Average train time per epoch: %.2f +/- %.2f' % (mean_train_time, std_train_time)) # ================================================================================================================== # EVALUATION # ================================================================================================================== test_score_file = snap_dir + 'test_scores.txt' with open('experiment_log.txt', 'a') as ff: print(args, file=ff) print('Stopped after %d epochs' % epoch, file=ff) print('Average train time per epoch: %.2f +/- %.2f' % (mean_train_time, std_train_time), file=ff) final_model = torch.load(snap_dir + args.flow + '.model') if args.testing: validation_loss, validation_bpd = evaluate(val_loader, final_model, args) test_loss, test_bpd = evaluate(test_loader, final_model, args, testing=True) with open('experiment_log.txt', 'a') as ff: print('FINAL EVALUATION ON VALIDATION SET\n' 'ELBO (VAL): {:.4f}\n'.format(validation_loss), file=ff) print('FINAL EVALUATION ON TEST SET\n' 'NLL (TEST): {:.4f}\n'.format(test_loss), file=ff) if args.input_type != 'binary': print('FINAL EVALUATION ON VALIDATION SET\n' 'ELBO (VAL) BPD : {:.4f}\n'.format(validation_bpd), file=ff) print('FINAL EVALUATION ON TEST SET\n' 'NLL (TEST) BPD: {:.4f}\n'.format(test_bpd), file=ff) else: validation_loss, validation_bpd = evaluate(val_loader, final_model, args) # save the test score in case you want to look it up later. _, _ = evaluate(test_loader, final_model, args, testing=True, file=test_score_file) with open('experiment_log.txt', 'a') as ff: print('FINAL EVALUATION ON VALIDATION SET\n' 'ELBO (VALIDATION): {:.4f}\n'.format(validation_loss), file=ff) if args.input_type != 'binary': print('FINAL EVALUATION ON VALIDATION SET\n' 'ELBO (VAL) BPD : {:.4f}\n'.format(validation_bpd), file=ff)
def train(self): """ Train the model and print out training data. """ # Define Loss function and Optimizer criterion = nn.CrossEntropyLoss() optimizer = optim.Adamax(self._net.parameters(), lr=0.001) # Use CUDA device if available device = self._set_device() start_time = time.perf_counter() train_loss_history = [] train_acc_history = [] # Train the network for epoch in range(10): running_loss = 0.0 train_loss = 0.0 correct = 0 total = 0 for i, data in enumerate(self._data_loader, 0): # data is a list of [inputs, labels] inputs, labels = data[0].to(device), data[1].to(device) # clear the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = self._net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # calculate training accuracy and loss _, predictions = torch.max(outputs, 1) correct += (predictions == labels).sum().item() total += labels.size(0) train_loss += loss.item() # print loss and accuracy every 500 mini-batches running_loss += loss.item() if i % 500 == 499: print( 'Epoch %d/10, %5d mini-batches, Loss: %.3f, Accuracy: %.3f' % (epoch + 1, i + 1, running_loss / 500, correct / total)) running_loss = 0.0 train_loss_history.append(train_loss / len(self._data_loader)) train_acc_history.append(correct / total) # print training time end_time = time.perf_counter() print( f'Finished training in {(end_time - start_time)/60:.2f} minutes.') # plot training accuracy and loss curve plt.plot(np.array(train_loss_history), 'b', label='Training Loss') plt.plot(np.array(train_acc_history), 'y', label='Training Accuracy') plt.legend() plt.show() self.save_network()
def main_train(epoch_start, epoch_end, train, arg): #%% net_name = arg['net_name'] loss_name = arg['loss_name'] filename = get_filename(net_name, loss_name) print('train model: ' + filename) if epoch_start == epoch_end: print('epoch_end is epoch_start, exist main_train') return #--------------------------------------- device = arg['device'] lr = arg['lr'] norm_type = arg['norm_type'] rand_pad = arg['rand_pad'] #%% num_classes = 9 if norm_type == np.inf: noise_norm = 0.1 max_iter = 1 step = 1.0 elif norm_type == 2: noise_norm = 5.0 max_iter = 1 step = 1.0 #%% loader_train, loader_val = get_dataloader(rand_pad=rand_pad) #%% loss_train_list = [] acc_train_list = [] acc_val_list = [] acc_test_list = [] epoch_save = epoch_start - 1 #%% model = Net(net_name) if epoch_start > 0: print('load', filename + '_epoch' + str(epoch_save) + '.pt') checkpoint = torch.load(filename + '_epoch' + str(epoch_save) + '.pt', map_location=torch.device('cpu')) model.load_state_dict(checkpoint['model_state_dict']) #------------------------ loss_train_list = checkpoint['result']['loss_train_list'] acc_train_list = checkpoint['result']['acc_train_list'] acc_val_list = checkpoint['result']['acc_val_list'] acc_test_list = checkpoint['result']['acc_test_list'] if 'E' in arg.keys(): if arg['E'] is None: arg['E'] = checkpoint['result']['arg']['E'] print('load E') #------------------------ model.to(device) #------------------------ if arg['optimizer'] == 'Adam': optimizer = optim.Adam(model.parameters(), lr=lr) elif arg['optimizer'] == 'AdamW': optimizer = optim.AdamW(model.parameters(), lr=lr) elif arg['optimizer'] == 'Adamax': optimizer = optim.Adamax(model.parameters(), lr=lr) elif arg['optimizer'] == 'SGD': optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.001, nesterov=True) else: raise NotImplementedError('unknown optimizer') #%% for epoch in range(epoch_save + 1, epoch_end): #-------- training -------------------------------- start = time.time() loss_train, acc_train = train(model, device, optimizer, loader_train, epoch, arg) loss_train_list.append(loss_train) acc_train_list.append(acc_train) print('epoch', epoch, 'training loss:', loss_train, 'acc:', acc_train) end = time.time() print('time cost:', end - start) #-------- validation -------------------------------- result_val = test(model, device, loader_val, num_classes=num_classes, class_balanced_acc=True) acc_val_list.append(result_val['acc']) #-------- test -------------------------------- #result_test = test(model, device, loader_test, num_classes=num_classes, class_balanced_acc=True) #acc_test_list.append(result_test['acc']) #--------save model------------------------- result = {} result['arg'] = arg result['loss_train_list'] = loss_train_list result['acc_train_list'] = acc_train_list result['acc_val_list'] = acc_val_list result['acc_test_list'] = acc_test_list if (epoch + 1) % 10 == 0: save_checkpoint(filename + '_epoch' + str(epoch) + '.pt', model, result, epoch) epoch_save = epoch #------- show result ---------------------- #plt.close('all') display.clear_output(wait=False) fig, ax = plot_result(loss_train_list, acc_train_list, acc_val_list, acc_test_list) display.display(fig) fig.savefig(filename + '_epoch' + str(epoch) + '.png') plt.close(fig)
model = Net() model.to(device) if config.LOSS == "l1": loss_function = nn.L1Loss() elif config.LOSS == "vgg": loss_function = loss.VggLoss() elif config.LOSS == "ssim": loss_function = loss.SsimLoss() elif config.LOSS == "l1+vgg": loss_function = loss.CombinedLoss() else: raise ValueError(f"Unknown loss: {config.LOSS}") optimizer = optim.Adamax(model.parameters(), lr=0.001) board_writer = SummaryWriter() # ---------------------------------------------------------------------- def train(epoch): print("===> Training...") before_pass = [p.data.clone() for p in model.parameters()] epoch_loss = 0 for iteration, batch in enumerate(training_data_loader, 1): input, target = batch[0].to(device), batch[1].to(device) optimizer.zero_grad()
if __name__ == '__main__': model = init_model(config.START_FROM_EXISTING_MODEL) if config.LOSS == "l1": loss_function = nn.L1Loss() elif config.LOSS == "vgg": loss_function = loss.VggLoss() elif config.LOSS == "ssim": loss_function = loss.SsimLoss() elif config.LOSS == "l1+vgg": loss_function = loss.CombinedLoss() else: raise ValueError(f"Unknown loss: {config.LOSS}") optimizer = optim.Adamax(model.parameters(), lr=config.LEARNING_RATE[0]) board_writer = SummaryWriter() # ---------------------------------------------------------------------- def train(epoch): print("===> Training...") before_pass = [p.data.clone() for p in model.parameters()] epoch_loss = 0 target_crop = _make_target_crop(config.PATCH_SIZE[0], config.PATCH_SIZE[1], config.CROP_SIZE, config.CROP_SIZE) epoch_lr = config.LEARNING_RATE[-1]
def forward(self, x): x = self.hidden(x) x = self.hidden2(x) x = self.hidden3(x) x = self.sig(x) x = self.output(x) return x in_d, out_d, in_test, out_test = load_data() net = Net() crit = nn.MSELoss() # opt = optim.SGD(params=net.parameters(),lr= 0.01) opt = optim.Adamax(params=net.parameters(), lr=0.002, betas=(0.9, 0.999)) for epoch in range(100): loss_value = 0.0 i = 0 for values in in_d: opt.zero_grad() temp = np.matrix(values) tensorIn = torch.from_numpy(temp).float() outs = net(tensorIn) tensorOut = torch.from_numpy(out_d[i]).float() loss = crit(outs, tensorOut)
def optim_selection(self): if self.config.optim == "Nesterov": return optim.SGD( self.model.parameters(), lr=self.config.lr, momentum=0.9, nesterov=True, weight_decay=0.0001, ) elif self.config.optim == "SGD": # weight_decay = l2 regularization return optim.SGD( self.model.parameters(), lr=self.config.lr, momentum=0.9, nesterov=False, weight_decay=0.0001, ) elif self.config.optim == "Adadelta": # default lr = 1.0 return optim.Adadelta( self.model.parameters(), lr=self.config.lr, rho=0.9, eps=1e-06, weight_decay=1e-6, ) elif self.config.optim == "Adagrad": # default lr = 0.01 return optim.Adagrad( self.model.parameters(), lr=self.config.lr, lr_decay=0, weight_decay=1e-6, initial_accumulator_value=0, eps=1e-10, ) elif self.config.optim == "Adam": # default lr=0.001 return optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=1e-6) elif self.config.optim == "AdamW": # default lr=0.001 return optim.AdamW( self.model.parameters(), lr=self.config.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, ) elif self.config.optim == "SparseAdam": # default lr = 0.001 return optim.SparseAdam( self.model.parameters(), lr=self.config.lr, betas=(0.9, 0.999), eps=1e-08, ) elif self.config.optim == "Adamax": # default lr=0.002 return optim.Adamax( self.model.parameters(), lr=self.config.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-6, ) elif self.config.optim == "ASGD": return optim.ASGD( self.model.parameters(), lr=self.config.lr, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=1e-6, ) elif self.config.optim == "RMSprop": # default lr=0.01 return optim.RMSprop( self.model.parameters(), lr=self.config.lr, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False, ) elif self.config.optim == "Rprop": # default lr=0.01 return optim.Rprop( self.model.parameters(), lr=self.config.lr, etas=(0.5, 1.2), step_sizes=(1e-06, 50), )
def main(argv): config = Config() config.load_user_config() config.log.info("finish loading user config") train_file = config.args["train_file"] dev_file = config.args["dev_file"] old_glove_file = config.args["glove_file"] new_glove_file = config.args["glove_file"] + ".subset" # TODO(demi): switch "overwrite" to False train_data_raw, dev_data_raw, i2w, w2i, i2c, c2i, new_glove_file, glove_dim, vocab_size, char_vocab_size\ = squad_read_data(config, train_file, dev_file, old_glove_file, new_glove_file, overwrite=True) config.log.info("finish reading squad data in raw formats") config.update_batch([("glove_file", new_glove_file), ("glove_dim", glove_dim), ("vocab_size", vocab_size), ("char_vocab_size", char_vocab_size)]) config.log.warning("reminder: now we only support train/fake mode") assert config.args["mode"] in ["train", "fake"], "mode (%s) not found" % config.args["mode"] train_id_conversion, train_data = make_dataset(config, train_data_raw, w2i, c2i) dev_id_conversion, dev_data = make_dataset(config, dev_data_raw, w2i, c2i) config.log.info("finish making datasets: reformatting raw data") train_data = QnADataset(train_data, config) dev_data = QnADataset(dev_data, config) config.log.info("finish generating datasets") train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=True, **config.kwargs) dev_loader = torch.utils.data.DataLoader(dev_data, batch_size=1, **config.kwargs) config.log.info("finish generating data loader") model = BiDAF(config, i2w) config.log.info("finish creating model") if config.args["use_cuda"]: model.cuda() # log config and model config.log.info(config.format_string()) config.log.info("model:{}".format(model)) if config.args['optimizer'] == "Adam": optimizer = optim.Adam(model.get_train_parameters(), lr=config.args['lr'], weight_decay=config.args['weight_decay']) if config.args['optimizer'] == "Adamax": optimizer = optim.Adamax(model.get_train_parameters(), lr=config.args['lr'], weight_decay=config.args['weight_decay']) if config.args['optimizer'] == "SGD": optimizer = torch.optim.SGD(model.get_train_parameters(), lr=config.args['lr'], momentum=0.9, weight_decay=config.args['weight_decay']) if config.args['optimizer'] == "Adadelta": optimizer = torch.optim.Adadelta(model.get_train_parameters(), lr=config.args["lr"]) #if config.args['optimizer'] == "Adagrad": config.log.info("model = %s" % model) config.log.info("config = %s" % config.format_string()) trainer = Trainer(config) evaluator = Evaluator(config) """ save model checkpoint """ def save_checkpoint(epoch): checkpoint = {"model_state_dict": model.state_dict(), "config_args" : config.args} if config.args["optimizer"] != "YF": # YF can't save state dict right now checkpoint["optimizer_state_dict"] = optimizer.state_dict() checkpoint_file = config.args["model_dir"] + config.args["model_name"] + "-EPOCH%d" % epoch torch.save(checkpoint, checkpoint_file) config.log.info("saving checkpoint: {}".format(checkpoint_file)) for epoch in range(1, config.args["max_epoch"] + 1): config.log.info("training: epoch %d" % epoch) # QS(demi): do i need to return model & optimizer? 
model, optimizer, train_avg_loss, train_answer_dict = trainer.run(model, train_id_conversion[0], train_loader, optimizer, mode="train") model, optimizer, dev_avg_loss, dev_answer_dict = trainer.run(model, dev_id_conversion[0], dev_loader, optimizer, mode="dev") # loss is a float tensor with size 1 config.log.info("[EPOCH %d] LOSS = (train)%.5lf | (dev)%.5lf" % (epoch, train_avg_loss[0], dev_avg_loss[0])) answer_filename = "{}/{}-EPOCH{}".format(config.args["model_dir"], config.args["model_name"], epoch) config.log.info("[EVAUATION] TRAIN EVAL") evaluator.eval("official", train_file, train_answer_dict, "{}/answer.train".format(config.args["model_dir"], answer_filename)) config.log.info("[EVAUATION] DEV EVAL") evaluator.eval("official", dev_file, dev_answer_dict, "{}/answer.dev".format(config.args["model_dir"], answer_filename)) save_checkpoint(epoch)
def main(args): device = torch.device("cuda:0") # model hyperparameters dataset = args.dataset batch_size = args.batch_size hps = Hyperparameters(base_dim=args.base_dim, res_blocks=args.res_blocks, bottleneck=args.bottleneck, skip=args.skip, weight_norm=args.weight_norm, coupling_bn=args.coupling_bn, affine=args.affine) scale_reg = 5e-5 # L2 regularization strength # optimization hyperparameters lr = args.lr momentum = args.momentum decay = args.decay # prefix for images and checkpoints filename = 'bs%d_' % batch_size \ + 'normal_' \ + 'bd%d_' % hps.base_dim \ + 'rb%d_' % hps.res_blocks \ + 'bn%d_' % hps.bottleneck \ + 'sk%d_' % hps.skip \ + 'wn%d_' % hps.weight_norm \ + 'cb%d_' % hps.coupling_bn \ + 'af%d' % hps.affine \ # load dataset train_split, val_split, data_info = data_utils.load(dataset) train_loader = torch.utils.data.DataLoader(train_split, batch_size=batch_size, shuffle=True, num_workers=2) val_loader = torch.utils.data.DataLoader(val_split, batch_size=batch_size, shuffle=False, num_workers=2) prior = distributions.Normal( # isotropic standard normal distribution torch.tensor(0.).to(device), torch.tensor(1.).to(device)) flow = realnvp.RealNVP(datainfo=data_info, prior=prior, hps=hps).to(device) optimizer = optim.Adamax(flow.parameters(), lr=lr, betas=(momentum, decay), eps=1e-7) epoch = 0 running_loss = 0. running_log_ll = 0. optimal_log_ll = float('-inf') early_stop = 0 image_size = data_info.channel * data_info.size**2 # full image dimension while epoch < args.max_epoch: epoch += 1 print('Epoch %d:' % epoch) flow.train() for batch_idx, data in enumerate(train_loader, 1): optimizer.zero_grad() x, _ = data # log-determinant of Jacobian from the logit transform x, log_det = data_utils.logit_transform(x) x = x.to(device) log_det = log_det.to(device) # log-likelihood of input minibatch log_ll, weight_scale = flow(x) log_ll = (log_ll + log_det).mean() # add L2 regularization on scaling factors loss = -log_ll + scale_reg * weight_scale running_loss += loss.item() running_log_ll += log_ll.item() loss.backward() optimizer.step() if batch_idx % 10 == 0: bit_per_dim = (-log_ll.item() + np.log(256.) * image_size) \ / (image_size * np.log(2.)) print('[%d/%d]\tloss: %.3f\tlog-ll: %.3f\tbits/dim: %.3f' % \ (batch_idx*batch_size, len(train_loader.dataset), loss.item(), log_ll.item(), bit_per_dim)) mean_loss = running_loss / batch_idx mean_log_ll = running_log_ll / batch_idx mean_bit_per_dim = (-mean_log_ll + np.log(256.) * image_size) \ / (image_size * np.log(2.)) print('===> Average train loss: %.3f' % mean_loss) print('===> Average train log-likelihood: %.3f' % mean_log_ll) print('===> Average train bit_per_dim: %.3f' % mean_bit_per_dim) running_loss = 0. running_log_ll = 0. flow.eval() with torch.no_grad(): for batch_idx, data in enumerate(val_loader, 1): x, _ = data x, log_det = data_utils.logit_transform(x) x = x.to(device) log_det = log_det.to(device) # log-likelihood of input minibatch log_ll, weight_scale = flow(x) log_ll = (log_ll + log_det).mean() # add L2 regularization on scaling factors loss = -log_ll + scale_reg * weight_scale running_loss += loss.item() running_log_ll += log_ll.item() mean_loss = running_loss / batch_idx mean_log_ll = running_log_ll / batch_idx mean_bit_per_dim = (-mean_log_ll + np.log(256.) * image_size) \ / (image_size * np.log(2.)) print('===> Average validation loss: %.3f' % mean_loss) print('===> Average validation log-likelihood: %.3f' % mean_log_ll) print('===> Average validation bits/dim: %.3f' % mean_bit_per_dim) running_loss = 0. running_log_ll = 0. 
samples = flow.sample(args.sample_size) samples, _ = data_utils.logit_transform(samples, reverse=True) utils.save_image( utils.make_grid(samples), './samples/' + dataset + '/' + filename + '_ep%d.png' % epoch) if mean_log_ll > optimal_log_ll: early_stop = 0 optimal_log_ll = mean_log_ll torch.save(flow, './models/' + dataset + '/' + filename + '.model') print('[MODEL SAVED]') else: early_stop += 1 if early_stop >= 100: break print('--> Early stopping %d/100 (BEST validation log-likelihood: %.3f)' \ % (early_stop, optimal_log_ll)) print('Training finished at epoch %d.' % epoch)
def train(): parser = argparse.ArgumentParser() # 配置文件 parser.add_argument( "--config-yml", default="exp_fvqa/exp2.yml", help= "Path to a config file listing reader, model and solver parameters.") parser.add_argument("--cpu-workers", type=int, default=8, help="Number of CPU workers for dataloader.") parser.add_argument( "--save-dirpath", default="fvqa/exp_data/checkpoints", help= "Path of directory to create checkpoint directory and save checkpoints." ) parser.add_argument( "--load-pthpath", default="", help="To continue training, path to .pth file of saved checkpoint.") parser.add_argument("--gpus", default="", help="gpus") parser.add_argument( "--overfit", action="store_true", help="Whether to validate on val split after every epoch.") parser.add_argument( "--validate", action="store_true", help="Whether to validate on val split after every epoch.") args = parser.parse_args() # set mannual seed torch.manual_seed(10) torch.cuda.manual_seed(10) cudnn.benchmark = True cudnn.deterministic = True config = yaml.load(open(args.config_yml)) device = torch.device("cuda:0") if args.gpus != "cpu" else torch.device( "cpu") # Print config and args. print(yaml.dump(config, default_flow_style=False)) for arg in vars(args): print("{:<20}: {}".format(arg, getattr(args, arg))) print('Loading TrainDataset...') train_dataset = FvqaTrainDataset(config, overfit=args.overfit) train_dataloader = DataLoader(train_dataset, batch_size=config['solver']['batch_size'], num_workers=args.cpu_workers, shuffle=True, collate_fn=collate_fn) if args.validate: print('Loading TestDataset...') val_dataset = FvqaTestDataset(config, overfit=args.overfit) val_dataloader = DataLoader(val_dataset, batch_size=config['solver']['batch_size'], num_workers=args.cpu_workers, shuffle=True, collate_fn=collate_fn) print('Loading glove...') que_vocab = Vocabulary(config['dataset']['word2id_path']) glove = np.load(config['dataset']['glove_vec_path']) glove = torch.Tensor(glove) print('Building Model...') model = CMGCNnet(config, que_vocabulary=que_vocab, glove=glove, device=device) if torch.cuda.device_count() > 1 and args.gpus != "cpu": print("Let's use", torch.cuda.device_count(), "GPUs!") model = nn.DataParallel(model) model = model.to(device) print(model) iterations = len(train_dataset) // config["solver"]["batch_size"] + 1 def lr_lambda_fun(current_iteration: int) -> float: current_epoch = float(current_iteration) / iterations if current_epoch <= config["solver"]["warmup_epochs"]: alpha = current_epoch / float(config["solver"]["warmup_epochs"]) return config["solver"]["warmup_factor"] * (1. 
- alpha) + alpha else: idx = bisect(config["solver"]["lr_milestones"], current_epoch) return pow(config["solver"]["lr_gamma"], idx) optimizer = optim.Adamax(model.parameters(), lr=config["solver"]["initial_lr"]) scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda_fun) T = iterations * (config["solver"]["num_epochs"] - config["solver"]["warmup_epochs"] + 1) scheduler2 = lr_scheduler.CosineAnnealingLR( optimizer, int(T), eta_min=config["solver"]["eta_min"], last_epoch=-1) summary_writer = SummaryWriter(log_dir=args.save_dirpath) checkpoint_manager = CheckpointManager(model, optimizer, args.save_dirpath, config=config) if args.load_pthpath == "": start_epoch = 0 else: start_epoch = int(args.load_pthpath.split("_")[-1][:-4]) model_state_dict, optimizer_state_dict = load_checkpoint( args.load_pthpath) if isinstance(model, nn.DataParallel): model.module.load_state_dict(model_state_dict) else: model.load_state_dict(model_state_dict) optimizer.load_state_dict(optimizer_state_dict) print("Loading resume model from {}...".format(args.load_pthpath)) global_iteration_step = start_epoch * iterations for epoch in range(start_epoch, config['solver']['num_epochs']): print(f"\nTraining for epoch {epoch}:") train_answers = [] train_preds = [] for i, batch in enumerate(tqdm(train_dataloader)): optimizer.zero_grad() fact_batch_graph = model(batch) batch_loss = cal_batch_loss(fact_batch_graph, batch, device, neg_weight=0.1, pos_weight=0.9) batch_loss.backward() optimizer.step() fact_graphs = dgl.unbatch(fact_batch_graph) for i, fact_graph in enumerate(fact_graphs): train_pred = fact_graph.ndata['h'].squeeze() # (num_nodes,1) train_preds.append(train_pred) # [(num_nodes,)] train_answers.append(batch['facts_answer_id_list'][i]) summary_writer.add_scalar('train/loss', batch_loss, global_iteration_step) summary_writer.add_scalar("train/lr", optimizer.param_groups[0]["lr"], global_iteration_step) summary_writer.add_text('train/loss', str(batch_loss.item()), global_iteration_step) summary_writer.add_text('train/lr', str(optimizer.param_groups[0]["lr"]), global_iteration_step) if global_iteration_step <= iterations * config["solver"][ "warmup_epochs"]: scheduler.step(global_iteration_step) else: global_iteration_step_in_2 = iterations * config["solver"][ "warmup_epochs"] + 1 - global_iteration_step scheduler2.step(int(global_iteration_step_in_2)) global_iteration_step = global_iteration_step + 1 torch.cuda.empty_cache() checkpoint_manager.step() train_acc_1, train_acc_3 = cal_acc( train_answers, train_preds) print( "trainacc@1={:.2%} & trainacc@3={:.2%} " .format(train_acc_1, train_acc_3)) summary_writer.add_scalars( 'train/acc', { 'acc@1': train_acc_1, 'acc@3': train_acc_3 }, epoch) if args.validate: model.eval() answers = [] # [batch_answers,...] preds = [] # [batch_preds,...] print(f"\nValidation after epoch {epoch}:") for i, batch in enumerate(tqdm(val_dataloader)): with torch.no_grad(): fact_batch_graph = model(batch) batch_loss = cal_batch_loss(fact_batch_graph, batch, device, neg_weight=0.1, pos_weight=0.9) summary_writer.add_scalar('test/loss', batch_loss, epoch) fact_graphs = dgl.unbatch(fact_batch_graph) for i, fact_graph in enumerate(fact_graphs): pred = fact_graph.ndata['h'].squeeze() # (num_nodes,1) preds.append(pred) # [(num_nodes,)] answers.append(batch['facts_answer_id_list'][i]) acc_1, acc_3 = cal_acc(answers, preds) print("acc@1={:.2%} & acc@3={:.2%} ". 
format(acc_1, acc_3)) summary_writer.add_scalars('test/acc', { 'acc@1': acc_1, 'acc@3': acc_3 }, epoch) model.train() torch.cuda.empty_cache() print('Train finished !!!') summary_writer.close()
def train(config_path): logger.info('------------MODEL TRAIN--------------') logger.info('loading config file...') global_config = read_config(config_path) # set random seed seed = global_config['global']['random_seed'] torch.manual_seed(seed) #set default gpu os.environ["CUDA_VISIBLE_DEVICES"] = str(global_config['train']["gpu_id"]) enable_cuda = global_config['train']['enable_cuda'] device = torch.device("cuda" if enable_cuda else "cpu") if torch.cuda.is_available() and not enable_cuda: logger.warning("CUDA is avaliable, you can enable CUDA in config file") elif not torch.cuda.is_available() and enable_cuda: raise ValueError( "CUDA is not abaliable, please unable CUDA in config file") logger.info('reading squad dataset...') dataset = SquadDataset(global_config) logger.info('constructing model...') model_choose = global_config['global']['model'] dataset_h5_path = global_config['data']['dataset_h5'] if model_choose == 'base': model_config = read_config('config/base_model.yaml') model = BaseModel(dataset_h5_path, model_config) elif model_choose == 'match-lstm': model = MatchLSTM(dataset_h5_path) elif model_choose == 'match-lstm+': model = MatchLSTMPlus(dataset_h5_path, global_config['preprocess']['use_domain_tag']) elif model_choose == 'r-net': model = RNet(dataset_h5_path) elif model_choose == 'm-reader': model = MReader(dataset_h5_path) else: raise ValueError('model "%s" in config file not recoginized' % model_choose) model = model.to(device) criterion = MyNLLLoss() # optimizer optimizer_choose = global_config['train']['optimizer'] optimizer_lr = global_config['train']['learning_rate'] optimizer_param = filter(lambda p: p.requires_grad, model.parameters()) if optimizer_choose == 'adamax': optimizer = optim.Adamax(optimizer_param) elif optimizer_choose == 'adadelta': optimizer = optim.Adadelta(optimizer_param) elif optimizer_choose == 'adam': optimizer = optim.Adam(optimizer_param) elif optimizer_choose == 'sgd': optimizer = optim.SGD(optimizer_param, lr=optimizer_lr) else: raise ValueError('optimizer "%s" in config file not recoginized' % optimizer_choose) # check if exist model weight weight_path = global_config['data']['model_path'] if os.path.exists(weight_path): logger.info('loading existing weight...') weight = torch.load(weight_path, map_location=lambda storage, loc: storage) if enable_cuda: weight = torch.load( weight_path, map_location=lambda storage, loc: storage.cuda()) # weight = pop_dict_keys(weight, ['pointer', 'init_ptr_hidden']) # partial initial weight model.load_state_dict(weight, strict=False) # training arguments logger.info('start training...') train_batch_size = global_config['train']['batch_size'] valid_batch_size = global_config['train']['valid_batch_size'] num_workers = global_config['global']['num_data_workers'] batch_train_data = dataset.get_dataloader_train(train_batch_size, num_workers) batch_dev_data = dataset.get_dataloader_dev(valid_batch_size, num_workers) clip_grad_max = global_config['train']['clip_grad_norm'] best_avg = 0. 
# every epoch for epoch in range(global_config['train']['epoch']): # train model.train() # set training = True, make sure right dropout sum_loss = train_on_model(model=model, criterion=criterion, optimizer=optimizer, batch_data=batch_train_data, epoch=epoch, clip_grad_max=clip_grad_max, device=device) logger.info('epoch=%d, sum_loss=%.5f' % (epoch, sum_loss)) # evaluate with torch.no_grad(): model.eval() # let training = False, make sure right dropout valid_score_em, valid_score_f1, valid_loss = eval_on_model( model=model, criterion=criterion, batch_data=batch_dev_data, epoch=epoch, device=device) valid_avg = (valid_score_em + valid_score_f1) / 2 logger.info( "epoch=%d, ave_score_em=%.2f, ave_score_f1=%.2f, sum_loss=%.5f" % (epoch, valid_score_em, valid_score_f1, valid_loss)) # save model when best avg score if valid_avg > best_avg: save_model( model, epoch=epoch, model_weight_path=global_config['data']['model_path'], checkpoint_path=global_config['data']['checkpoint_path']) logger.info("saving model weight on epoch=%d" % epoch) best_avg = valid_avg logger.info('pretraining finished.') if global_config['global']['finetune']: batch_train_data = dataset.get_dataloader_train2( train_batch_size, num_workers) batch_dev_data = dataset.get_dataloader_dev2(valid_batch_size, num_workers) for epoch in range(global_config['train']['finetune_epoch']): # train model.train() # set training = True, make sure right dropout sum_loss = train_on_model(model=model, criterion=criterion, optimizer=optimizer, batch_data=batch_train_data, epoch=epoch, clip_grad_max=clip_grad_max, device=device) logger.info('finetune epoch=%d, sum_loss=%.5f' % (epoch, sum_loss)) # evaluate with torch.no_grad(): model.eval() # let training = False, make sure right dropout valid_score_em, valid_score_f1, valid_loss = eval_on_model( model=model, criterion=criterion, batch_data=batch_dev_data, epoch=epoch, device=device) valid_avg = (valid_score_em + valid_score_f1) / 2 logger.info( "finetune epoch=%d, ave_score_em=%.2f, ave_score_f1=%.2f, sum_loss=%.5f" % (epoch, valid_score_em, valid_score_f1, valid_loss)) # save model when best avg score if valid_avg > best_avg: save_model( model, epoch=epoch, model_weight_path=global_config['data']['model_path'], checkpoint_path=global_config['data']['checkpoint_path']) logger.info("saving model weight on epoch=%d" % epoch) best_avg = valid_avg if global_config['global']['finetune2']: batch_train_data = dataset.get_dataloader_train3( train_batch_size, num_workers) batch_dev_data = dataset.get_dataloader_dev3(valid_batch_size, num_workers) for epoch in range(global_config['train']['finetune_epoch2']): # train model.train() # set training = True, make sure right dropout sum_loss = train_on_model(model=model, criterion=criterion, optimizer=optimizer, batch_data=batch_train_data, epoch=epoch, clip_grad_max=clip_grad_max, device=device) logger.info('finetune2 epoch=%d, sum_loss=%.5f' % (epoch, sum_loss)) # evaluate with torch.no_grad(): model.eval() # let training = False, make sure right dropout valid_score_em, valid_score_f1, valid_loss = eval_on_model( model=model, criterion=criterion, batch_data=batch_dev_data, epoch=epoch, device=device) valid_avg = (valid_score_em + valid_score_f1) / 2 logger.info( "finetune2 epoch=%d, ave_score_em=%.2f, ave_score_f1=%.2f, sum_loss=%.5f" % (epoch, valid_score_em, valid_score_f1, valid_loss)) # save model when best avg score if valid_avg > best_avg: save_model( model, epoch=epoch, model_weight_path=global_config['data']['model_path'], 
checkpoint_path=global_config['data']['checkpoint_path']) logger.info("saving model weight on epoch=%d" % epoch) best_avg = valid_avg logger.info('finished.')
{"params": model.parameters()}, optim.Adagrad(lr=0.1, params=model.parameters()), id="AdagradConf", ), pytest.param( "Adam", {"lr": 0.1}, {"params": model.parameters()}, optim.Adam(lr=0.1, params=model.parameters()), id="AdamConf", ), pytest.param( "Adamax", {"lr": 0.1}, {"params": model.parameters()}, optim.Adamax(lr=0.1, params=model.parameters()), id="AdamaxConf", ), pytest.param( "AdamW", {"lr": 0.1}, {"params": model.parameters()}, optim.AdamW(lr=0.1, params=model.parameters()), id="AdamWConf", ), pytest.param( "ASGD", {"lr": 0.1}, {"params": model.parameters()}, optim.ASGD(lr=0.1, params=model.parameters()), id="ASGDConf",
def optimization_algorithms(SCI_optimizer, cnn, LR, SCI_SGD_MOMENTUM, REGULARIZATION): if type(SCI_optimizer) is str: if (SCI_optimizer == 'Adam'): optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION) if (SCI_optimizer == 'AMSGrad'): optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION, amsgrad=True) if (SCI_optimizer == 'AdamW'): optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION) if (SCI_optimizer == 'RMSprop'): optimizer = optim.RMSprop(cnn.parameters(), lr=LR) #if (SCI_optimizer == 'SparseAdam') or (int(SCI_optimizer) == 4) : #optimizer = optim.SparseAdam(cnn.parameters(), lr=LR) if (SCI_optimizer == 'SGD'): optimizer = optim.SGD(cnn.parameters(), lr=LR, momentum=SCI_SGD_MOMENTUM, weight_decay=REGULARIZATION) if (SCI_optimizer == 'Adadelta'): optimizer = optim.Adadelta(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (SCI_optimizer == 'Rprop'): optimizer = optim.Rprop(cnn.parameters(), lr=LR) #if (SCI_optimizer == 'Adagrad') or (int(SCI_optimizer) == 7) : # optimizer = optim.Adagrad(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (SCI_optimizer == 'Adamax'): optimizer = optim.Adamax(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (SCI_optimizer == 'ASGD'): optimizer = optim.ASGD(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) #if (SCI_optimizer == 'LBFGS') or (int(SCI_optimizer) == 10) : #optimizer = optim.LBFGS(cnn.parameters(), lr=LR) else: if (int(SCI_optimizer) == 1): optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 2): optimizer = optim.Adam(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION, amsgrad=True) if (int(SCI_optimizer) == 3): optimizer = AdamW(cnn.parameters(), lr=LR, betas=(0.01, 0.999), weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 4): optimizer = optim.RMSprop(cnn.parameters(), lr=LR) #if (SCI_optimizer == 'SparseAdam') or (int(SCI_optimizer) == 4) : #optimizer = optim.SparseAdam(cnn.parameters(), lr=LR) if (int(SCI_optimizer) == 5): optimizer = optim.SGD(cnn.parameters(), lr=LR, momentum=SCI_SGD_MOMENTUM, weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 6): optimizer = optim.Adadelta(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 7): optimizer = optim.Rprop(cnn.parameters(), lr=LR) #if (SCI_optimizer == 'Adagrad') or (int(SCI_optimizer) == 7) : # optimizer = optim.Adagrad(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 8): optimizer = optim.Adamax(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) if (int(SCI_optimizer) == 9): optimizer = optim.ASGD(cnn.parameters(), lr=LR, weight_decay=REGULARIZATION) #if (SCI_optimizer == 'LBFGS') or (int(SCI_optimizer) == 10) : #optimizer = optim.LBFGS(cnn.parameters(), lr=LR) return optimizer
def main(lr, batch_size, epoch, gpu, train_set, valid_set): # ------------- Part for tensorboard -------------- writer = SummaryWriter(comment="_naive_DENET") # ------------- Part for tensorboard -------------- # -------------- Some prepare --------------------- torch.backends.cudnn.enabled = True torch.cuda.set_device(gpu) # torch.set_default_tensor_type('torch.cuda.FloatTensor') # -------------- Some prepare --------------------- BATCH_SIZE = batch_size EPOCH = epoch LEARNING_RATE = lr belta1 = 0.9 belta2 = 0.999 trainset = mydataset(train_set, transform_train) valset = mydataset(valid_set) trainLoader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True) valLoader = torch.utils.data.DataLoader(valset, batch_size=1, shuffle=False) opter = Opter(128, 128, batch_size) SepConvNet = Network(opter).cuda() SepConvNet.apply(weights_init) # SepConvNet.load_state_dict(torch.load('/mnt/hdd/xiasifeng/sepconv/sepconv_mutiscale_LD/SepConv_iter33-ltype_fSATD_fs-lr_0.001-trainloss_0.1497-evalloss_0.1357-evalpsnr_29.6497.pkl')) # SepConvNet_cost = nn.MSELoss().cuda() # SepConvNet_cost = nn.L1Loss().cuda() SepConvNet_cost = sepconv.SATDLoss().cuda() SepConvNet_optimizer = optim.Adamax(SepConvNet.parameters(), lr=LEARNING_RATE, betas=(belta1, belta2)) SepConvNet_schedule = optim.lr_scheduler.ReduceLROnPlateau( SepConvNet_optimizer, factor=0.1, patience=3, verbose=True, min_lr=1e-5) # ---------------- Time part ------------------- start_time = time.time() global_step = 0 # ---------------- Time part ------------------- for epoch in range(0, EPOCH): SepConvNet.train().cuda() cnt = 0 sumloss = 0.0 # The sumloss is for the whole training_set tsumloss = 0.0 # The tsumloss is for the printinterval printinterval = 300 print("---------------[Epoch%3d]---------------" % (epoch + 1)) for imgL, imgR, label in trainLoader: global_step = global_step + 1 cnt = cnt + 1 SepConvNet_optimizer.zero_grad() imgL = var(imgL).cuda() imgR = var(imgR).cuda() label = var(label).cuda() output = SepConvNet(imgL, imgR) loss = SepConvNet_cost(output, label) loss.backward() SepConvNet_optimizer.step() sumloss = sumloss + loss.data.item() tsumloss = tsumloss + loss.data.item() if cnt % printinterval == 0: writer.add_image("Prev image", imgR[0], cnt) writer.add_image("Pred image", output[0], cnt) writer.add_scalar('Train Batch SATD loss', loss.data.item(), int(global_step / printinterval)) writer.add_scalar('Train Interval SATD loss', tsumloss / printinterval, int(global_step / printinterval)) print( 'Epoch [%d/%d], Iter [%d/%d], Time [%4.4f], Batch loss [%.6f], Interval loss [%.6f]' % (epoch + 1, EPOCH, cnt, len(trainset) // BATCH_SIZE, time.time() - start_time, loss.data.item(), tsumloss / printinterval)) tsumloss = 0.0 print('Epoch [%d/%d], iter: %d, Time [%4.4f], Avg Loss [%.6f]' % (epoch + 1, EPOCH, cnt, time.time() - start_time, sumloss / cnt)) # ---------------- Part for validation ---------------- trainloss = sumloss / cnt SepConvNet.eval().cuda() evalcnt = 0 pos = 0.0 sumloss = 0.0 psnr = 0.0 for imgL, imgR, label in valLoader: imgL = var(imgL).cuda() imgR = var(imgR).cuda() label = var(label).cuda() with torch.no_grad(): output = SepConvNet(imgL, imgR) loss = SepConvNet_cost(output, label) sumloss = sumloss + loss.data.item() psnr = psnr + calcPSNR.calcPSNR(output.cpu().data.numpy(), label.cpu().data.numpy()) evalcnt = evalcnt + 1 # ------------- Tensorboard part ------------- writer.add_scalar("Valid SATD loss", sumloss / evalcnt, epoch) writer.add_scalar("Valid PSNR", psnr / valset.__len__(), epoch) # 
------------- Tensorboard part ------------- print('Validation loss [%.6f], Average PSNR [%.4f]' % (sumloss / evalcnt, psnr / valset.__len__())) SepConvNet_schedule.step(psnr / valset.__len__()) torch.save( SepConvNet.state_dict(), os.path.join( '.', 'naive_DENET_iter' + str(epoch + 1) + '-ltype_fSATD_fs' + '-lr_' + str(LEARNING_RATE) + '-trainloss_' + str(round(trainloss, 4)) + '-evalloss_' + str(round(sumloss / evalcnt, 4)) + '-evalpsnr_' + str(round(psnr / valset.__len__(), 4)) + '.pkl')) writer.close()
for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'cnn' in key: params += [{'params':[value], 'lr':opt.cnn_learning_rate, 'weight_decay':opt.cnn_weight_decay, 'betas':(opt.cnn_optim_alpha, opt.cnn_optim_beta)}] else: params += [{'params':[value], 'lr':opt.learning_rate, 'weight_decay':opt.weight_decay, 'betas':(opt.optim_alpha, opt.optim_beta)}] print("Use %s as optmization method" %(opt.optim)) if opt.optim == 'sgd': optimizer = optim.SGD(params, momentum=0.9) elif opt.optim == 'adam': optimizer = optim.Adam(params) elif opt.optim == 'adamax': optimizer = optim.Adamax(params) # if opt.cnn_optim == 'sgd': # cnn_optimizer = optim.SGD(cnn_params, momentum=0.9) # else: # cnn_optimizer = optim.Adam(cnn_params) # load optimizer # learning_rate_list = np.linspace(opt.learning_rate, 0.0005, opt.max_epochs) for epoch in range(start_epoch, opt.max_epochs): if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0: if (epoch - opt.learning_rate_decay_start) % opt.learning_rate_decay_every == 0: # decay the learning rate. utils.set_lr(optimizer, opt.learning_rate_decay_rate) opt.learning_rate = opt.learning_rate * opt.learning_rate_decay_rate
def main(lr, batch_size, epoch, gpu, train_set, valid_set): # ------------- Part for tensorboard -------------- # writer = SummaryWriter(log_dir='tb/LSTM_ft1') # ------------- Part for tensorboard -------------- torch.backends.cudnn.enabled = True torch.cuda.set_device(gpu) BATCH_SIZE = batch_size EPOCH = epoch LEARNING_RATE = lr belta1 = 0.9 belta2 = 0.999 trainset = vimeodataset(train_set, 'filelist.txt', transform_train) valset = vimeodataset(valid_set, 'test.txt') trainLoader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True) valLoader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False) assert (len(valset) % BATCH_SIZE == 0) SepConvNet = Network().cuda() # SepConvNet.apply(weights_init) SepConvNet.load_my_state_dict( torch.load( 'ft2_baseline_iter86-ltype_fSATD_fs-lr_0.001-trainloss_0.1249-evalloss_0.1155-evalpsnr_29.9327.pkl', map_location='cuda:%d' % (gpu))) # SepConvNet.load_state_dict(torch.load('beta_LSTM_iter8-ltype_fSATD_fs-lr_0.001-trainloss_0.557-evalloss_0.1165-evalpsnr_29.8361.pkl')) # MSE_cost = nn.MSELoss().cuda() # SepConvNet_cost = nn.L1Loss().cuda() SepConvNet_cost = sepconv.SATDLoss().cuda() SepConvNet_optimizer = optim.Adamax(SepConvNet.parameters(), lr=LEARNING_RATE, betas=(belta1, belta2)) SepConvNet_schedule = optim.lr_scheduler.ReduceLROnPlateau( SepConvNet_optimizer, factor=0.1, patience=3, verbose=True) # ---------------- Time part ------------------- start_time = time.time() global_step = 0 # ---------------- Time part ------------------- # ---------------- Opt part ----------------------- # opter = Opter(gpu) # ------------------------------------------------- for epoch in range(0, EPOCH): SepConvNet.train().cuda() cnt = 0 sumloss = 0.0 # The sumloss is for the whole training_set tsumloss = 0.0 # The tsumloss is for the printinterval printinterval = 100 print("---------------[Epoch%3d]---------------" % (epoch + 1)) for label_list in trainLoader: bad_list = label_list[7:] label_list = label_list[:7] # IPython.embed() # exit() global_step = global_step + 1 cnt = cnt + 1 for i in range(5): imgL = var(bad_list[i]).cuda() imgR = var(bad_list[i + 1]).cuda() label = var(label_list[i + 2]).cuda() poor_label = var(bad_list[i + 2]).cuda() label_L = var(label_list[i]).cuda() SepConvNet_optimizer.zero_grad() if i == 0: output_f, output_b, stat = SepConvNet(imgL, imgR) else: output_f, output_b, stat = SepConvNet( imgL, imgR, res_f, res_b, stat) res_f = poor_label - output_f res_b = imgL - output_b loss = 0.5 * SepConvNet_cost(output_f, label) + 0.5 * SepConvNet_cost( output_b, label_L) if i < 4: loss.backward(retain_graph=True) else: loss.backward() SepConvNet_optimizer.step() sumloss = sumloss + loss.data.item() tsumloss = tsumloss + loss.data.item() if cnt % printinterval == 0: print( 'Epoch [%d/%d], Iter [%d/%d], Time [%4.4f], Batch loss [%.6f], Interval loss [%.6f]' % (epoch + 1, EPOCH, cnt, len(trainset) // BATCH_SIZE, time.time() - start_time, loss.data.item(), tsumloss / printinterval / 5)) tsumloss = 0.0 print('Epoch [%d/%d], iter: %d, Time [%4.4f], Avg Loss [%.6f]' % (epoch + 1, EPOCH, cnt, time.time() - start_time, sumloss / cnt / 5)) # ---------------- Part for validation ---------------- trainloss = sumloss / cnt SepConvNet.eval().cuda() evalcnt = 0 pos = 0.0 sumloss = 0.0 psnr = 0.0 for label_list in valLoader: bad_list = label_list[7:] label_list = label_list[:7] with torch.no_grad(): for i in range(5): imgL = var(bad_list[i]).cuda() imgR = var(bad_list[i + 1]).cuda() label = var(label_list[i + 2]).cuda() 
poor_label = var(bad_list[i + 2]).cuda() label_L = var(label_list[i]).cuda() if i == 0: output_f, output_b, stat = SepConvNet(imgL, imgR) else: output_f, output_b, stat = SepConvNet( imgL, imgR, res_f, res_b, stat) psnr = psnr + calcPSNR.calcPSNR( output_f.cpu().data.numpy(), label.cpu().data.numpy()) res_f = poor_label - output_f res_b = label_L - output_b loss = SepConvNet_cost(output_f, label) sumloss = sumloss + loss.data.item() evalcnt = evalcnt + 5 # ------------- Tensorboard part ------------- # writer.add_scalar("Valid SATD loss", sumloss / evalcnt, epoch) # writer.add_scalar("Valid PSNR", psnr / valset.__len__(), epoch) # ------------- Tensorboard part ------------- print('Validation loss [%.6f], Average PSNR [%.4f]' % (sumloss / evalcnt, psnr / evalcnt)) SepConvNet_schedule.step(psnr / evalcnt) torch.save( SepConvNet.state_dict(), os.path.join( '.', 'minidual_LSTM_iter' + str(epoch + 1) + '-ltype_fSATD_fs' + '-lr_' + str(LEARNING_RATE) + '-trainloss_' + str(round(trainloss, 4)) + '-evalloss_' + str(round(sumloss / evalcnt, 4)) + '-evalpsnr_' + str(round(psnr / evalcnt, 4)) + '.pkl'))
def lr_lambda_fun(current_iteration: int) -> float: """Returns a learning rate multiplier. Till `warmup_epochs`, learning rate linearly increases to `initial_lr`, and then gets multiplied by `lr_gamma` every time a milestone is crossed. """ current_epoch = float(current_iteration) / iterations if current_epoch < config["solver"]["warmup_epochs"]: alpha = current_epoch / float(config["solver"]["warmup_epochs"]) return config["solver"]["warmup_factor"] * (1.0 - alpha) + alpha else: idx = bisect(config["solver"]["lr_milestones"], current_epoch) return pow(config["solver"]["lr_gamma"], idx) optimizer = optim.Adamax(model.parameters(), lr=config["solver"]["initial_lr"]) scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda_fun) # 可以在一个组里面调节lr参数 # ============================================================================= # SETUP BEFORE TRAINING LOOP # ============================================================================= start_time = datetime.datetime.strftime(datetime.datetime.utcnow(), '%d-%b-%Y-%H:%M:%S') checkpoint_dirpath = args.save_dirpath if checkpoint_dirpath == 'checkpoints/': checkpoint_dirpath += '%s+%s/%s' % (config["model"]["encoder"], config["model"]["decoder"], start_time) if args.save_model: summary_writer = SummaryWriter(log_dir=checkpoint_dirpath) checkpoint_manager = CheckpointManager(model, optimizer, checkpoint_dirpath, config=config) sparse_metrics = SparseGTMetrics() ndcg = NDCG()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--data', default='./data/') parser.add_argument('--epoch', type=int, default=10000) parser.add_argument('--batch_size', type=int, default=20) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--embed_dim', type=int, default=300) parser.add_argument('--hidden_dim', type=int, default=150) parser.add_argument('--num_layers', type=int, default=1) parser.add_argument('--bidirectional', default=True) parser.add_argument('--glove', default='../Data/glove/glove.840B.300d.txt') parser.add_argument('--cuda_set', default=True) args = parser.parse_args() ############################################################################### # Load data ############################################################################### print("Load data file...") train_data, dev_data = load_data(args.data) print("Preparing batch loader...") print("============= Train ===============") train_loader = BatchLoader(train_data, 'train', args.cuda_set, args.batch_size) print("============= Valid ===============") dev_loader = BatchLoader(dev_data, 'dev', args.cuda_set, args.batch_size) # vocabulary set vocab_size = len(dev_data['word2idx']) print("============= Vocab Size ===============") print(vocab_size) print("") idx2word = dev_data['idx2word'] ############################################################################### # Build the model ############################################################################### cuda.set_device(0) if args.cuda_set == True: model = MatchNet(vocab_size, args.embed_dim, args.hidden_dim, args.cuda_set, args.num_layers, args.bidirectional).cuda() criterion = nn.NLLLoss().cuda() else: model = MatchNet(vocab_size, args.embed_dim, args.hidden_dim, args.cuda_set, args.num_layers, args.bidirectional) criterion = nn.CrossEntropyLoss() optimizer = optim.Adamax(model.parameters(), lr=args.lr, betas=(0.9, 0.999)) print("#" * 15, "Model Info", "#" * 15) print("Model: ", model) print("Criterion: ", criterion) print("Optimizer: ", optimizer) print("") ############################################################################### # Load the pretrained word embedding ############################################################################### print("loading pretrained word embedding ...") emb_file = os.path.join(args.data, 'glove_emb.pth') if os.path.isfile(emb_file): W_emb = torch.load(emb_file) else: W_emb, embed_dim = load_pretrained_embedding(dev_data['word2idx'], args.glove) W_emb = torch.from_numpy(W_emb).cuda() torch.save(W_emb, emb_file) if args.cuda_set: W_emb = W_emb.cuda() model.embed.embed.state_dict()['weight'].copy_(W_emb) model.embed.embed.state_dict()['weight'].requires_grad = False ############################################################################### # Training ############################################################################### for epoch in range(args.epoch): start_time = time.time() train_loss = AverageMeter() train_acc = AverageMeter() message = "Epoch: %d training.."
% (epoch) print(message) print("") for i, data in enumerate(train_loader): model.train() doc = data[0] qry = data[1] anss = data[2] sis = data[3] eis = data[4] ids = data[5] dm = data[9] qm = data[10] output1, output2 = model(doc, qry, dm, qm) _, pred1 = output1.data.cpu().topk(1) _, pred2 = output2.data.cpu().topk(1) loss1 = criterion(output1, sis) loss2 = criterion(output2, eis) loss = loss1 + loss2 train_loss.update(loss.data[0], doc.size(0)) acc_tmp = accuracy(pred1.numpy(), tensor2np(sis), pred2.numpy(), tensor2np(eis), ids) train_acc.update(acc_tmp, doc.size(0)) optimizer.zero_grad() loss.backward() optimizer.step() print( "=================== Train ======================") print("doc_len: ", doc.size(1)) random_idx = randomChoice(doc.size(0)) show_question(random_idx, doc, qry, dm, qm, anss, idx2word) show_answer(random_idx, doc, qry, pred1, pred2, sis, eis, idx2word) print("") message = "Train epoch: %d iter: %d train_loss: %1.3f train_acc: %1.3f elapsed: %1.3f " % ( epoch, i, train_loss.avg, train_acc.avg, time.time() - start_time) print(message) print("") ############################################################################### # Validation ############################################################################### print( "==================== Evaluation ======================") val_acc = AverageMeter() start_time = time.time() model.eval() cor_cnt = 0 incor_cnt = 0 pad_cnt = 0 val_out = 0 val_in = 0 val_false = 0 for j, data in enumerate(dev_loader): doc = data[0] qry = data[1] anss = data[2] sis = data[3] eis = data[4] ids = data[5] dm = data[9] qm = data[10] output1, output2 = model(doc, qry, dm, qm) _, val_pred1 = output1.data.cpu().topk(1) _, val_pred2 = output2.data.cpu().topk(1) acc_tmp = accuracy_dev(val_pred1.numpy(), sis, val_pred2.numpy(), eis, ids) val_acc.update(acc_tmp, doc.size(0)) message = "Epoch: %d train_iter: %d iter: %d val_acc: %1.3f elapsed: %1.3f " % ( epoch, i, j, val_acc.avg, time.time() - start_time) print(message) print("") ############################################################################### # Show the sample Q&A ############################################################################### random_idx = randomChoice(doc.size(0)) show_question(random_idx, doc, qry, dm, qm, anss, idx2word) show_answer_dev(random_idx, doc, qry, val_pred1, val_pred2, sis, eis, idx2word) train_loss = AverageMeter() train_acc = AverageMeter() start_time = time.time()
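# --- Added sketch (not part of the original snippet above, toy shapes assumed) ---
# Copying into model.embed.embed.state_dict()['weight'] works because the returned
# tensor shares storage with the parameter, but setting requires_grad = False on that
# detached copy typically does not freeze the live parameter. A common way to load
# and freeze a pretrained embedding:
import torch
import torch.nn as nn

toy_vocab_size, toy_embed_dim = 1000, 300
toy_embed = nn.Embedding(toy_vocab_size, toy_embed_dim)
toy_W_emb = torch.randn(toy_vocab_size, toy_embed_dim)  # placeholder for the GloVe matrix
with torch.no_grad():
    toy_embed.weight.copy_(toy_W_emb)   # write into the live parameter
toy_embed.weight.requires_grad = False  # actually freezes the embedding
toy_trainable = [p for p in toy_embed.parameters() if p.requires_grad]  # empty here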
def train_model(opt_): env = TrainEnvironment(opt_) dictionary = env.dict if opt_.load_checkpoint: net, dictionary = load_model(opt_.load_checkpoint, opt_) env = TrainEnvironment(opt_, dictionary) env.dict = dictionary else: net = create_model(opt_, dictionary["words"]) if opt_.embeddings and opt_.embeddings != "None": load_embeddings(opt_, dictionary["words"], net) paramnum = 0 trainable = 0 for name, parameter in net.named_parameters(): if parameter.requires_grad: trainable += parameter.numel() paramnum += parameter.numel() print("TRAINABLE", paramnum, trainable) if opt_.cuda: net = torch.nn.DataParallel(net) net = net.cuda() if opt_.optimizer == "adamax": lr = opt_.learning_rate or 0.002 named_params_to_optimize = filter(lambda p: p[1].requires_grad, net.named_parameters()) params_to_optimize = (p[1] for p in named_params_to_optimize) optimizer = optim.Adamax(params_to_optimize, lr=lr) if opt_.epoch_start != 0: saved_params = torch.load( opt_.load_checkpoint, map_location=lambda storage, loc: storage) optimizer.load_state_dict(saved_params["optim_dict"]) else: lr = opt_.learning_rate or 0.01 optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=lr) start_time = time.time() # best_loss = float("+inf") best_loss = 0 test_data_shuffled = env.build_valid_dataloader(True) test_data_not_shuffled = env.build_valid_dataloader(False) with torch.no_grad(): validate( 0, net, test_data_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="shuffled", ) train_data = None for epoch in range(opt_.epoch_start, opt_.num_epochs): if train_data is None or opt_.dataset_name == "reddit": train_data = env.build_train_dataloader(epoch) train(epoch, start_time, net, optimizer, opt_, train_data) with torch.no_grad(): # We compute the loss both for shuffled and not shuffled case. # however, the loss that determines if the model is better is the # same as the one used for training. loss_shuffled = validate( epoch, net, test_data_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="shuffled", ) loss_not_shuffled = validate( epoch, net, test_data_not_shuffled, nb_candidates=opt_.hits_at_nb_cands, shuffled_str="not-shuffled", ) if opt_.no_shuffle: loss = loss_not_shuffled else: loss = loss_shuffled # if loss < best_loss: # ========= change loss to P@1,100 ======= if loss > best_loss: best_loss = loss best_loss_epoch = epoch # logging.info(f"New best loss, saving model to {opt_.model_file}") logging.info( f"New best P@1,100, saving model to {opt_.model_file}") save_model(opt_.model_file, net, dictionary, optimizer) # Stop if it's been too many epochs since the loss has decreased if opt_.stop_crit_num_epochs != -1: if epoch - best_loss_epoch >= opt_.stop_crit_num_epochs: break return net, dictionary
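# --- Added sketch (not part of the original snippet above, dummy validate()) ---
# The checkpointing above was switched from "lower loss is better" to "higher
# P@1,100 is better"; training stops once the metric has not improved for
# stop_crit_num_epochs epochs. Minimal stand-alone version:
import random

def toy_validate(epoch):
    return random.random()  # placeholder for P@1,100

toy_best, toy_best_epoch = 0.0, 0
toy_patience, toy_num_epochs = 5, 50  # toy_patience plays the role of stop_crit_num_epochs
for toy_epoch in range(toy_num_epochs):
    toy_score = toy_validate(toy_epoch)
    if toy_score > toy_best:              # "better" now means larger
        toy_best, toy_best_epoch = toy_score, toy_epoch
        # save_model(...) would be called here
    elif toy_epoch - toy_best_epoch >= toy_patience:
        break                             # no improvement for toy_patience epochs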
def train(model, train_loader, test_loader, gen_loader, configs): model.train() # optimizer, it's better to set up lr for some modules separately so that the whole training become more stable params = [{ 'params': model.reader.parameters(), 'lr': 0.2 * configs.lr }, { 'params': model.h_mean.parameters(), 'lr': 0.1 * configs.lr }, { 'params': model.h_var.parameters(), 'lr': 0.1 * configs.lr }, { 'params': model.writer.parameters() }, { 'params': model.pos_dist.parameters() }, { 'params': model.combine.parameters() }, { 'params': model.describe.parameters() }, { 'params': model.box_vae.parameters(), 'lr': 10 * configs.lr }, { 'params': model.offset_vae.parameters(), 'lr': 10 * configs.lr }, { 'params': model.renderer.parameters() }, { 'params': model.bias_mean.parameters() }, { 'params': model.bias_var.parameters() }] if configs.net == 'PNP': params.append({'params': model.vis_dist.parameters()}) elif configs.net == 'SIMPLE': pass else: raise ValueError('configs.net ?= ', configs.net, 'not a valid value') optimizer = optim.Adamax(params, lr=configs.lr) model.cuda() trainer = PNPNetTrainer(model=model, train_loader=train_loader, val_loader=test_loader, gen_loader=gen_loader, optimizer=optimizer, configs=configs) minloss = 1000 for epoch_num in range(0, configs.epochs + 1): timestamp_start = datetime.datetime.now( pytz.timezone('America/New_York')) trainer.train_epoch(epoch_num, timestamp_start) if epoch_num % configs.validate_interval == 0 and epoch_num > 0: minloss = trainer.validate(epoch_num, timestamp_start, minloss) if epoch_num % configs.sample_interval == 0 and epoch_num > 0: trainer.sample(epoch_num, sample_num=8, timestamp_start=timestamp_start) if epoch_num % configs.save_interval == 0 and epoch_num > 0: torch.save( model.state_dict(), osp.join(configs.exp_dir, 'checkpoints', 'model_epoch_{0}.pth'.format(epoch_num)))
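# --- Added sketch (not part of the original snippet above, module names illustrative) ---
# Per-module learning rates via optimizer parameter groups, as in the params list
# above; groups without an explicit 'lr' fall back to the optimizer-level default.
import torch.nn as nn
import torch.optim as optim

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.reader = nn.Linear(8, 8)
        self.writer = nn.Linear(8, 8)
        self.box_vae = nn.Linear(8, 8)

toy_model, toy_lr = ToyModel(), 0.001
toy_optimizer = optim.Adamax([
    {'params': toy_model.reader.parameters(), 'lr': 0.2 * toy_lr},  # slower module
    {'params': toy_model.writer.parameters()},                      # default lr
    {'params': toy_model.box_vae.parameters(), 'lr': 10 * toy_lr},  # faster module
], lr=toy_lr)
for toy_group in toy_optimizer.param_groups:
    print(toy_group['lr'])  # 0.0002, 0.001, 0.01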
if args.cuda != -1: rnn = rnn.cuda(args.cuda) print(rnn) last_save_losses = [] if args.optim == 'adam': optimizer = optim.Adam(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'adamax': optimizer = optim.Adamax(rnn.parameters(), lr=args.lr, eps=1e-9, betas=[0.9, 0.98]) # 0.0001 elif args.optim == 'rmsprop': optimizer = optim.RMSprop(rnn.parameters(), lr=args.lr, momentum=0.9, eps=1e-10) # 0.0001 elif args.optim == 'sgd': optimizer = optim.SGD(rnn.parameters(), lr=args.lr) # 0.01 elif args.optim == 'adagrad': optimizer = optim.Adagrad(rnn.parameters(), lr=args.lr) elif args.optim == 'adadelta': optimizer = optim.Adadelta(rnn.parameters(), lr=args.lr) last_100_losses = []
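# --- Added sketch (not part of the original snippet above, the LSTM stands in for rnn) ---
# An equivalent, table-driven version of the if/elif optimizer dispatch above,
# using the same hyperparameters as the snippet.
import torch.nn as nn
import torch.optim as optim
from functools import partial

TOY_OPTIMIZERS = {
    'adam':     partial(optim.Adam,    eps=1e-9,  betas=[0.9, 0.98]),
    'adamax':   partial(optim.Adamax,  eps=1e-9,  betas=[0.9, 0.98]),
    'rmsprop':  partial(optim.RMSprop, eps=1e-10, momentum=0.9),
    'sgd':      optim.SGD,
    'adagrad':  optim.Adagrad,
    'adadelta': optim.Adadelta,
}

toy_rnn = nn.LSTM(16, 16)
toy_optimizer = TOY_OPTIMIZERS['adamax'](toy_rnn.parameters(), lr=0.001)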
def main( dataset, dataroot, download, augment, batch_size, eval_batch_size, epochs, saved_model, seed, hidden_channels, K, L, actnorm_scale, flow_permutation, flow_coupling, LU_decomposed, learn_top, y_condition, y_weight, max_grad_clip, max_grad_norm, lr, n_workers, cuda, n_init_batches, output_dir, saved_optimizer, warmup, ): device = "cpu" if (not torch.cuda.is_available() or not cuda) else "cuda:0" check_manual_seed(seed) ds = check_dataset(dataset, dataroot, augment, download) image_shape, num_classes, train_dataset, test_dataset = ds # Note: unsupported for now multi_class = False train_loader = data.DataLoader( train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_workers, drop_last=True, ) test_loader = data.DataLoader( test_dataset, batch_size=eval_batch_size, shuffle=False, num_workers=n_workers, drop_last=False, ) model = Glow( image_shape, hidden_channels, K, L, actnorm_scale, flow_permutation, flow_coupling, LU_decomposed, num_classes, learn_top, y_condition, ) model = model.to(device) optimizer = optim.Adamax(model.parameters(), lr=lr, weight_decay=5e-5) lr_lambda = lambda epoch: min(1.0, (epoch + 1) / warmup) # noqa scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) def step(engine, batch): model.train() optimizer.zero_grad() x, y = batch x = x.to(device) if y_condition: y = y.to(device) z, nll, y_logits = model(x, y) losses = compute_loss_y(nll, y_logits, y_weight, y, multi_class) else: z, nll, y_logits = model(x, None) losses = compute_loss(nll) losses["total_loss"].backward() if max_grad_clip > 0: torch.nn.utils.clip_grad_value_(model.parameters(), max_grad_clip) if max_grad_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() return losses def eval_step(engine, batch): model.eval() x, y = batch x = x.to(device) with torch.no_grad(): if y_condition: y = y.to(device) z, nll, y_logits = model(x, y) losses = compute_loss_y(nll, y_logits, y_weight, y, multi_class, reduction="none") else: z, nll, y_logits = model(x, None) losses = compute_loss(nll, reduction="none") return losses trainer = Engine(step) checkpoint_handler = ModelCheckpoint(output_dir, "glow", n_saved=2, require_empty=False) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, { "model": model, "optimizer": optimizer }, ) monitoring_metrics = ["total_loss"] RunningAverage(output_transform=lambda x: x["total_loss"]).attach( trainer, "total_loss") evaluator = Engine(eval_step) # Note: replace by https://github.com/pytorch/ignite/pull/524 when released Loss( lambda x, y: torch.mean(x), output_transform=lambda x: ( x["total_loss"], torch.empty(x["total_loss"].shape[0]), ), ).attach(evaluator, "total_loss") if y_condition: monitoring_metrics.extend(["nll"]) RunningAverage(output_transform=lambda x: x["nll"]).attach( trainer, "nll") # Note: replace by https://github.com/pytorch/ignite/pull/524 when released Loss( lambda x, y: torch.mean(x), output_transform=lambda x: (x["nll"], torch.empty(x["nll"].shape[0])), ).attach(evaluator, "nll") pbar = ProgressBar() pbar.attach(trainer, metric_names=monitoring_metrics) # load pre-trained model if given if saved_model: model.load_state_dict(torch.load(saved_model)) model.set_actnorm_init() if saved_optimizer: optimizer.load_state_dict(torch.load(saved_optimizer)) file_name, ext = os.path.splitext(saved_model) resume_epoch = int(file_name.split("_")[-1]) @trainer.on(Events.STARTED) def resume_training(engine): engine.state.epoch = resume_epoch engine.state.iteration = resume_epoch * 
len( engine.state.dataloader) @trainer.on(Events.STARTED) def init(engine): model.train() init_batches = [] init_targets = [] with torch.no_grad(): for batch, target in islice(train_loader, None, n_init_batches): init_batches.append(batch) init_targets.append(target) init_batches = torch.cat(init_batches).to(device) assert init_batches.shape[0] == n_init_batches * batch_size if y_condition: init_targets = torch.cat(init_targets).to(device) else: init_targets = None model(init_batches, init_targets) @trainer.on(Events.EPOCH_COMPLETED) def evaluate(engine): evaluator.run(test_loader) scheduler.step() metrics = evaluator.state.metrics losses = ", ".join( [f"{key}: {value:.2f}" for key, value in metrics.items()]) print(f"Validation Results - Epoch: {engine.state.epoch} {losses}") timer = Timer(average=True) timer.attach( trainer, start=Events.EPOCH_STARTED, resume=Events.ITERATION_STARTED, pause=Events.ITERATION_COMPLETED, step=Events.ITERATION_COMPLETED, ) @trainer.on(Events.EPOCH_COMPLETED) def print_times(engine): pbar.log_message( f"Epoch {engine.state.epoch} done. Time per batch: {timer.value():.3f}[s]" ) timer.reset() trainer.run(train_loader, epochs)
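# --- Added sketch (not part of the original snippet above, toy parameters) ---
# The per-epoch linear warmup used above: the multiplier ramps from 1/warmup to 1.0
# over the first `warmup` scheduler steps and then stays at 1.0.
import torch
import torch.optim as optim

toy_warmup = 5
toy_params = [torch.nn.Parameter(torch.zeros(1))]
toy_optimizer = optim.Adamax(toy_params, lr=5e-4, weight_decay=5e-5)
toy_scheduler = torch.optim.lr_scheduler.LambdaLR(
    toy_optimizer, lr_lambda=lambda epoch: min(1.0, (epoch + 1) / toy_warmup))
for toy_epoch in range(10):
    toy_optimizer.step()
    toy_scheduler.step()  # called once per epoch, as in the evaluate() handler above
    print(toy_epoch, toy_optimizer.param_groups[0]['lr'])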
weight_decay=5e-4) # use MomentumSGD as the optimizer optKind2 = 'MomentumSGD' network3 = CNNNet().to(device) # assign the previously defined network optimizer3 = optim.Adam(network3.parameters(), lr=0.001) # use Adam as the optimizer optKind3 = 'Adam' network4 = CNNNet().to(device) # assign the previously defined network optimizer4 = optim.RMSprop(network4.parameters(), lr=0.0005, eps=1e-06) # use RMSprop as the optimizer optKind4 = 'RMSprop' network5 = CNNNet().to(device) # assign the previously defined network optimizer5 = optim.Adamax(network5.parameters(), lr=0.002, betas=(0.9, 0.999), eps=1e-08, weight_decay=0) # use Adamax as the optimizer optKind5 = 'Adamax' times, test_acc_list, train_loss_lists, train_acc_lists, val_loss_lists, val_acc_lists = [], [], [], [], [], [] Optims = [optKind1, optKind2, optKind3, optKind4, optKind5] Optimizers = [optimizer1, optimizer2, optimizer3, optimizer4, optimizer5] networks = [network1, network2, network3, network4, network5] for i in range(len(Optims)): network = networks[i] optKind = Optims[i] optimizer = Optimizers[i]
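# --- Added sketch (not part of the original snippet above, toy net replaces CNNNet) ---
# The comparison above gives every optimizer its own freshly initialised network so
# the runs do not share state; a compact version of the same set-up:
import torch.nn as nn
import torch.optim as optim

def make_toy_net():
    return nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))

toy_candidates = {
    'MomentumSGD': lambda p: optim.SGD(p, lr=0.01, momentum=0.9, weight_decay=5e-4),
    'Adam':        lambda p: optim.Adam(p, lr=0.001),
    'RMSprop':     lambda p: optim.RMSprop(p, lr=0.0005, eps=1e-06),
    'Adamax':      lambda p: optim.Adamax(p, lr=0.002, betas=(0.9, 0.999), eps=1e-08),
}
toy_runs = []
for toy_name, toy_make_opt in toy_candidates.items():
    toy_net = make_toy_net()
    toy_runs.append((toy_name, toy_net, toy_make_opt(toy_net.parameters())))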
def train(data, save_model_dir, seg=True): print("Training with {} model.".format(data.model_type)) #data.show_data_summary() model = SeqModel(data) print( "finish building model.") parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adamax(parameters, lr=data.HP_lr) best_dev = -1 best_dev_p = -1 best_dev_r = -1 best_test = -1 best_test_p = -1 best_test_r = -1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print(("Epoch: %s/%s" %(idx,data.HP_iteration))) optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_loss = 0 batch_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num instance = data.train_Ids[start:end] words = data.train_texts[start:end] if not instance: continue gaz_list, batch_word, batch_biword, batch_wordlen, batch_label, layer_gaz, gaz_count, gaz_chars, gaz_mask, gazchar_mask, mask, batch_bert, bert_mask = batchify_with_label(instance, data.HP_gpu,data.HP_num_layer) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss(gaz_list, batch_word, batch_biword, batch_wordlen, layer_gaz, gaz_count,gaz_chars, gaz_mask, gazchar_mask, mask, batch_label, batch_bert, bert_mask) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole sample_loss += loss.data total_loss += loss.data batch_loss += loss if end%500 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print((" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))) sys.stdout.flush() sample_loss = 0 if end%data.HP_batch_size == 0: batch_loss.backward() optimizer.step() model.zero_grad() batch_loss = 0 temp_time = time.time() temp_cost = temp_time - temp_start print((" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) ) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print(("Epoch: %s training finished. 
Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))) speed, acc, p, r, f, pred_labels, gazs = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if seg: current_score = f print(("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))) else: current_score = acc print(("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))) if current_score > best_dev: if seg: print( "Exceed previous best f score:", best_dev) else: print( "Exceed previous best acc score:", best_dev) model_name = save_model_dir torch.save(model.state_dict(), model_name) #best_dev = current_score best_dev_p = p best_dev_r = r # ## decode test speed, acc, p, r, f, pred_labels, gazs = evaluate(data, model, "test") test_finish = time.time() test_cost = test_finish - dev_finish if seg: current_test_score = f print(("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))) else: current_test_score = acc print(("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))) if current_score > best_dev: best_dev = current_score best_test = current_test_score best_test_p = p best_test_r = r print("Best dev score: p:{}, r:{}, f:{}".format(best_dev_p,best_dev_r,best_dev)) print("Test score: p:{}, r:{}, f:{}".format(best_test_p,best_test_r,best_test)) gc.collect() with open(data.result_file,"a") as f: f.write(save_model_dir+'\n') f.write("Best dev score: p:{}, r:{}, f:{}\n".format(best_dev_p,best_dev_r,best_dev)) f.write("Test score: p:{}, r:{}, f:{}\n\n".format(best_test_p,best_test_r,best_test)) f.close()
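# --- Added sketch (not part of the original snippet above; this is an assumption) ---
# A common inverse-time implementation of the per-epoch lr_decay(optimizer, epoch,
# decay_rate, init_lr) helper called at the top of every epoch above, not necessarily
# the exact function used in this code base.
import torch
import torch.optim as optim

def toy_lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr / (1 + decay_rate * epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

toy_params = [torch.nn.Parameter(torch.zeros(1))]
toy_optimizer = optim.Adamax(toy_params, lr=0.005)
for toy_epoch in range(3):
    toy_optimizer = toy_lr_decay(toy_optimizer, toy_epoch, 0.05, 0.005)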