def basic_callback_dict(identifier, save="val_loss"):
    callback_dict = defaultdict(list)
    # checkpoint path lives next to this module; empty identifier falls back to best_model.pt
    path = Path(__file__).resolve().parent / (identifier + ".pt" if identifier else "best_model.pt")
    if save == "val_loss":
        ModelSaver(path=path).register(callback_dict)
    else:
        raise NotImplementedError(f"Saving by {save} not implemented.")
    return callback_dict
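# Illustrative usage of basic_callback_dict (the identifier below is hypothetical):
# the returned defaultdict maps event names to lists of callbacks, with the
# ModelSaver registered to monitor validation loss.
callbacks = basic_callback_dict("unet_baseline", save="val_loss")
# The checkpoint path resolves next to this module (e.g. .../unet_baseline.pt).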
def train(args):
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])

    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        recorder.update([train_miou, train_acc, train_loss, val_miou, val_acc, val_loss])
        recorder.save(args.record_path)

        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch+1, model, ious={'train': train_ious, 'val': val_ious})
        print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
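# A minimal sketch of what the Recorder used above might look like, assuming it
# buffers one row of metric values per epoch and writes them out as CSV.
# The real class in this project may differ; the name RecorderSketch is hypothetical.
import csv


class RecorderSketch:
    def __init__(self, names):
        self.names = names
        self.rows = []

    def update(self, values):
        # one value per metric name, in the order given at construction time
        self.rows.append(list(values))

    def save(self, path):
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.names)
            writer.writerows(self.rows)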
def train(train_sets: tuple, test_sets: tuple, input_shape: tuple = (1, 128, 128, 1),
          model_version="1.0.0", epochs: int = 100, classes: int = 2, batch_size: int = 1,
          verbose=1, out_dir: str = "saved_models"):
    """
    The function to train the model.

    Parameters:
        train_sets (tuple): A tuple of np.array for train images and train labels.
        test_sets (tuple): A tuple of np.array for test images and test labels.
        input_shape (tuple): Input shape of the model, in the form (1, ..., ...).
        model_version (str): The version of the model in d.d.d format.
        epochs (int): The number of epochs.
        classes (int): The number of classes.
        batch_size (int): The batch size.
        verbose (int): Whether to show the progress of each epoch.
        out_dir (str): The output directory to save the model in.
    """
    (x_train, y_train), (x_test, y_test) = train_sets, test_sets
    y_train = keras.utils.to_categorical(y_train, classes)
    y_test = keras.utils.to_categorical(y_test, classes)

    m = get_model(model_version)
    if not m:
        return
    model = m.build_model(input_shape)
    model.compile(loss=BinaryCrossentropy(),
                  optimizer=RMSprop(learning_rate=0.0001),
                  metrics=['accuracy'])

    saver = ModelSaver(out_dir)
    csv_logger = CSVLogger(
        "%s/%s/log.csv" % (out_dir, datetime.datetime.now().date().strftime("%Y_%m_%d")),
        append=True, separator=',')

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=verbose,
                        validation_data=(x_test, y_test),
                        callbacks=[saver, csv_logger])

    model.save("%s/%s/final.hd5" % (out_dir, datetime.datetime.now().date().strftime("%Y_%m_%d")))
    print("Model saved in %s as final.hd5" % out_dir)
    plot_results(history, epochs, out_dir)
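# Minimal usage sketch with random data; the shapes, dtypes, and label range below
# are assumptions for illustration only (get_model must know version "1.0.0").
import numpy as np

x_train = np.random.rand(8, 128, 128, 1).astype("float32")
y_train = np.random.randint(0, 2, size=(8,))
x_test = np.random.rand(2, 128, 128, 1).astype("float32")
y_test = np.random.randint(0, 2, size=(2,))
train((x_train, y_train), (x_test, y_test), epochs=1, batch_size=2)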
def __init__(self, weights=None, biases=None):
    self.weights = weights if weights else self.weights
    self.biases = biases if biases else self.biases
    self.datasets = get_datasets(heart_diseases, nr_inputs)
    self.label_data = get_labels(self.datasets)
    self.saver = ModelSaver(save_dir="saved_models/cnn/")

    logs_path = "tensorboard_data/cnn/"
    self.tensorboard_handler = TensorBoardHandler(logs_path)
    self.tensorboard_handler.add_histograms(self.weights)
    self.tensorboard_handler.add_histograms(self.biases)
    self.build()
def build_trainer(config, seed, args):
    monitoring_metrics = [
        'epoch', 'iteration', 'total_loss', 'latent_loss', 'seg_loss',
        'NET', 'ED', 'ET'
    ]
    logger = Logger(save_dir=config.save.save_dir,
                    config=config,
                    seed=seed,
                    name=config.save.study_name,
                    monitoring_metrics=monitoring_metrics)
    save_dir_path = logger.log_dir

    checkpoint_callback = ModelSaver(limit_num=10,
                                     monitor=None,
                                     filepath=os.path.join(
                                         save_dir_path,
                                         'ckpt-{epoch:04d}-{total_loss:.2f}'),
                                     save_top_k=-1)

    if config.run.resume_checkpoint:
        print('Training will resume from: {}'.format(config.run.resume_checkpoint))
        model = TumorSegmentation.load_from_checkpoint(
            config.run.resume_checkpoint,
            config=config,
            save_dir_path=save_dir_path,
        )
    else:
        model = TumorSegmentation(config, save_dir_path)

    trainer = pl.Trainer(gpus=config.run.visible_devices,
                         num_nodes=1,
                         max_epochs=config.run.n_epochs,
                         progress_bar_refresh_rate=1,
                         automatic_optimization=True,
                         distributed_backend=config.run.distributed_backend,
                         deterministic=True,
                         logger=logger,
                         sync_batchnorm=True,
                         checkpoint_callback=checkpoint_callback,
                         resume_from_checkpoint=config.run.resume_checkpoint,
                         limit_val_batches=10)

    return model, trainer
parser.add_argument('--unfreeze', type=str, metavar='UF', default='',
                    help='Provide an option for unfreezing given layers')
parser.add_argument('--freeze', type=str, metavar='F', default='',
                    help='Provide an option for freezing given layers')
parser.add_argument('--pretrain', action='store_true')
parser.add_argument('--fc-only', action='store_true')
parser.add_argument('--except-fc', action='store_true')
parser.add_argument('--load-best', action='store_true')
parser.add_argument('--load-last', action='store_true')
parser.add_argument('--continue-step', action='store_true')
parser.add_argument('--train-all', action='store_true', help='Train all layers')
args = parser.parse_args()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
l2_dist = PairwiseDistance(2)
modelsaver = ModelSaver()


def save_if_best(state, acc):
    modelsaver.save_if_best(acc, state)


def main():
    init_log_just_created("log/valid.csv")
    init_log_just_created("log/train.csv")
    import pandas as pd
    valid = pd.read_csv('log/valid.csv')
    max_acc = valid['acc'].max()

    pretrain = args.pretrain
    fc_only = args.fc_only
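# A minimal sketch of the save-if-best behaviour assumed by the wrapper above:
# remember the best accuracy seen so far and only write a checkpoint when it
# improves. The real ModelSaver may differ; the class name and path are hypothetical.
import torch


class ModelSaverSketch:
    def __init__(self, path='log/best_state.pt'):
        self.path = path
        self.best_acc = float('-inf')

    def save_if_best(self, acc, state):
        if acc > self.best_acc:
            self.best_acc = acc
            torch.save(state, self.path)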
        betas=(
            CONFIG.hyperparam.optimization.beta1,
            CONFIG.hyperparam.optimization.beta2,
        ),
        weight_decay=CONFIG.hyperparam.optimization.weight_decay,
    )
else:
    raise NotImplementedError("only Adam implemented")

#########################################################
################# evaluator, saver ######################
print("loading evaluator and model saver...")
evaluator = NLGEval(no_skipthoughts=True, no_glove=True)
# evaluator = NLGEval(metrics_to_omit=["METEOR"])
model_path = os.path.join(outdir, "best_score.ckpt")
saver = ModelSaver(model_path, init_val=0)
offset_ep = 1
offset_ep = saver.load_ckpt(model, optimizer, device)
if offset_ep > CONFIG.hyperparam.misc.max_epoch:
    raise RuntimeError(
        "trying to restart at epoch {} while max training is set to {} "
        "epochs".format(offset_ep, CONFIG.hyperparam.misc.max_epoch))
########################################################

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
if CONFIG.use_wandb:
    wandb.watch(model)

################### training loop #####################
    return x


model = NIMA()
model = model.to(device)

#########
# Train #
#########
parameters = [
    {"params": model.base_model.parameters()},
    {"params": model.head.parameters(), "lr": 3e-5},
]
optimizer = torch.optim.Adam(parameters, lr=3e-6)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.95)
saver = ModelSaver()


def train(model, train_loader, scheduler, optimizer):
    criterion = EMDLoss()  # r=2 for train
    logging.info("Train Phase, Epoch: {}".format(epoch))
    scheduler.step()

    emd_losses = AverageMeter()
    model.train()
    for batch_num, batch in enumerate(train_loader, 1):
        imgs, labels = batch
        imgs, labels = imgs.to(device).float(), labels.to(device).float()
        scores = model(imgs)
        emd_loss = criterion(scores, labels)
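# A minimal sketch of the AverageMeter assumed above (the classic running-average
# helper from the PyTorch ImageNet example); the real implementation may differ.
class AverageMeterSketch:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)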
def run(opt):
    train_loader, test_loader = create_loaders(opt)

    # Initialize generator and discriminator
    generator = load_or_init_models(RetouchGenerator(opt.device, opt.pw_guide), opt)

    # Optimizers
    optimizer_G = torch.optim.Adam(generator.parameters(), lr=opt.lr, weight_decay=1e-8)

    # Losses
    # criterion_GAN = torch.nn.MSELoss()
    # criterion_pixelwise = torch.nn.L1Loss()

    # if opt.cuda:
    #     generator = generator.cuda()
    #     discriminator = discriminator.cuda()
    generator, criterion_pixelwise = to_variables((generator, torch.nn.MSELoss()),
                                                  cuda=opt.cuda, device=opt.device)

    saverG = ModelSaver(f'{opt.checkpoint_dir}/saved_models/{opt.name}')
    train_writer = SummaryWriter(log_dir=os.path.join(opt.checkpoint_dir, 'train'))
    test_writer = SummaryWriter(log_dir=os.path.join(opt.checkpoint_dir, 'test'))

    for epoch in tqdm(range(opt.epoch, opt.n_epochs), desc='Training'):
        ####
        # Train
        ###
        avg_stats = defaultdict(float)
        for i, data in enumerate(train_loader):
            data = to_variables(data, cuda=opt.cuda, device=opt.device)
            y_hat, loss_G = trainG(generator, criterion_pixelwise, optimizer_G, data)
            update_stats(avg_stats, loss_G)

            # Print image to tensorboard
            if (epoch % opt.sample_interval == 0) and (i % 50 == 0):
                train_writer.add_image('RetouchNet', y_hat[0], epoch)
                train_writer.add_image('Edited', data[2][0], epoch)
                train_writer.add_image('Original', data[0][0], epoch)

        # Log Progress
        str_out = '[train] {}/{} '.format(epoch, opt.n_epochs)
        for k, v in avg_stats.items():
            avg = v / len(train_loader)
            train_writer.add_scalar(k, avg, epoch)
            str_out += '{}: {:.6f} '.format(k, avg)
        print(str_out)

        ####
        # Test
        ###
        avg_stats = defaultdict(float)
        images = None
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                data = to_variables(data, cuda=opt.cuda, device=opt.device, test=True)
                images, losses = test(generator, criterion_pixelwise, data)
                update_stats(avg_stats, losses)

                # Print image to tensorboard
                if (epoch % opt.sample_interval == 0) and (i % 5 == 0):
                    test_writer.add_image('RetouchNet', images[0], epoch)
                    test_writer.add_image('Edited', data[2][0], epoch)
                    test_writer.add_image('Original', data[0][0], epoch)

        # Log Progress
        str_out = '[test] {}/{} '.format(epoch, opt.n_epochs)
        for k, v in avg_stats.items():
            avg = v / len(test_loader)
            test_writer.add_scalar(k, avg, epoch)
            str_out += '{}: {:.6f} '.format(k, avg)
        print(str_out)

        # If at sample interval save image
        # if epoch % opt.sample_interval == 0:
        #     x_hr, x_lr, y_hr, y_lr = data
        #     test_writer.add_image('RetouchNet', images[0], epoch)
        #     test_writer.add_image('GroundTruth', y_hr[0], epoch)
        #     test_writer.add_image('raw', x_hr[0], epoch)

        if epoch % opt.checkpoint_interval == 0:
            # Save model checkpoints
            saverG.save_if_best(generator, loss_G['loss_G'])
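# A minimal sketch of the update_stats helper assumed above: accumulate each named
# loss into the running sums that are later divided by len(loader) for averaging.
# The real helper may track counts differently; this version is hypothetical.
def update_stats(avg_stats, losses):
    for name, value in losses.items():
        avg_stats[name] += float(value)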
optimizer = optim.Adam(
    model.parameters(),
    lr=CONFIG.hyperparam.optimization.lr,
    betas=(
        CONFIG.hyperparam.optimization.beta1,
        CONFIG.hyperparam.optimization.beta2,
    ),
    weight_decay=CONFIG.hyperparam.optimization.weight_decay,
)
logging.info("done!")

#########################################################
################# load model params ######################
logging.info("loading model params...")
model_path = os.path.join(outdir, "best_score.ckpt")
saver = ModelSaver(model_path)
offset_ep = saver.load_ckpt(model, optimizer, device)
if offset_ep == 1:
    raise RuntimeError("aborting, no pretrained model")
logging.info("done!")
##########################################################

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

################# make submission file ######################
logging.info("making submission file...")
submission = {
    "version": "VERSION 1.3",
    "external_data": {
        "used":
def main():
    logging = get_root_logger(args.log_path, mode='a')
    logging.info('Command Line Arguments:')
    for key, i in vars(args).items():
        logging.info(key + ' = ' + str(i))
    logging.info('End Command Line Arguments')

    batch_size = args.batch_size
    num_epochs = args.num_epochs
    resume_from = args.resume_from
    steps_per_checkpoint = args.steps_per_checkpoint
    gpu_id = args.gpu_id
    configure_process(args, gpu_id)
    if gpu_id > -1:
        logging.info('Using CUDA on GPU ' + str(gpu_id))
        args.cuda = True
    else:
        logging.info('Using CPU')
        args.cuda = False

    '''Load data'''
    logging.info('Data base dir ' + args.data_base_dir)
    logging.info('Loading vocab from ' + args.vocab_file)
    with open(args.vocab_file, "r", encoding='utf-8') as f:
        args.target_vocab_size = len(f.readlines()) + 4

    logging.info('Load training data from ' + args.data_path)
    train_data = UIDataset(args.data_base_dir, args.data_path, args.label_path, args.vocab_file)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                              num_workers=2, drop_last=True, collate_fn=collate_fn)

    logging.info('Load validation data from ' + args.val_data_path)
    val_data = UIDataset(args.data_base_dir, args.val_data_path, args.label_path, args.vocab_file)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True,
                            num_workers=2, drop_last=True, collate_fn=collate_fn)

    # Build model
    logging.info('Building model')
    if args.resume_from:
        logging.info('Loading checkpoint from %s' % resume_from)
        checkpoint = torch.load(resume_from)
    else:
        checkpoint = None
        logging.info('Creating model with fresh parameters')
    model = build_model(args, gpu_id, checkpoint)
    logging.info(model)

    n_params, enc, dec = cal_parameters(model)
    logging.info('encoder: %d' % enc)
    logging.info('decoder: %d' % dec)
    logging.info('number of parameters: %d' % n_params)

    # Build optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    optim = Optimizer(optimizer)
    if checkpoint:
        optim.load_state_dict(checkpoint['optim'])
        optim.training_step += 1

    # Build model saver
    model_saver = ModelSaver(args.model_dir, model, optim)

    train(model, optim, model_saver, num_epochs, train_loader, val_loader,
          steps_per_checkpoint, args.valid_steps, args.lr_decay, args.start_decay_at, args.cuda)
obs_size = 2 * player_size + hand_size + hand_size + 32
num_actions = 4
hidden_1 = 256
hidden_2 = 64

# Actor maps state to actions' probabilities
actor = nn.Sequential(nn.Linear(obs_size, hidden_1),
                      nn.ReLU(),
                      nn.Linear(hidden_1, hidden_2),
                      nn.ReLU(),
                      nn.Linear(hidden_2, num_actions),
                      nn.Softmax(dim=1))

optimizer = optim.Adam(actor.parameters(), lr=0.01)
discounting = 0.99999

saver = ModelSaver({
    'actor': actor,
    'optim_actor': optimizer
}, './models/Sedma/VPG-3')
# saver.load()
# saver.load(ignore_errors=True)


def card2tensor(c):
    suit, rank = c
    suit = suit2ix[suit]
    rank = rank2ix[rank]
    return torch.cat((one_hot(suit, len(suits)), one_hot(rank, len(ranks))))


def state2tensor(state):