def build_optimizer(self, named_params):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.opt.weight_decay},
        {'params': [p for n, p in named_params
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    fused_adam = FusedAdam(optimizer_grouped_parameters,
                           lr=self.opt.learning_rate,
                           bias_correction=False,
                           max_grad_norm=self.opt.clip)
    # params = [p for n, p in named_params if p.requires_grad]
    # fused_adam = BertAdam(params, lr=self.opt.learning_rate,
    #                       max_grad_norm=self.opt.clip,
    #                       weight_decay=self.opt.weight_decay)
    return FP16_Optimizer(fused_adam, dynamic_loss_scale=True)
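# The same "no weight decay on bias/LayerNorm" grouping can be reproduced with the
# stock torch.optim.AdamW when apex's FusedAdam/FP16_Optimizer are unavailable.
# A minimal, self-contained sketch; `opt` and the Linear model below are assumed
# stand-ins for `self.opt` and the real network above, not part of the original code.
import torch
from types import SimpleNamespace

opt = SimpleNamespace(learning_rate=2e-5, weight_decay=0.01, clip=1.0)  # assumed values
model = torch.nn.Linear(16, 4)  # placeholder model

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': opt.weight_decay},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=opt.learning_rate)
# Gradient clipping is done manually here instead of FusedAdam's max_grad_norm:
# torch.nn.utils.clip_grad_norm_(model.parameters(), opt.clip)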
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    # lr = args.lr * (0.1 ** (epoch // 30))
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
    #                             momentum=0.9, dampening=0,
    #                             weight_decay=hparams.weight_decay)
    if hparams.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss, recon_loss, S_kl_loss, R_kl_loss, speaker_loss, augment_loss, \
                alignment_loss = criterion(y_pred, y, iteration)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if hparams.fp16_run else False

            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, recon_loss, S_kl_loss, R_kl_loss,
                                    speaker_loss, augment_loss, alignment_loss,
                                    iteration)

            if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
                validate(model, criterion, valset, iteration, hparams.batch_size,
                         n_gpus, collate_fn, logger, hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
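# `reduce_tensor` is called in the distributed branch above but not defined in this
# snippet. A minimal sketch of the usual helper (an assumption, modeled on the common
# all-reduce-and-average pattern) that returns the loss averaged across ranks:
import torch
import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    # Sum the tensor over all ranks, then divide so every process logs the same value.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt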
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    print("Loading models...")
    model = load_model(hparams)

    print("Initializing optimizer...")
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    if hparams.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

    criterion = Tacotron2Loss()

    print("Initializing logger...")
    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    print("Initializing dataloader...")
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    print("Loading checkpoints...")
    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()

    if hparams.distributed_run or torch.cuda.device_count() > 1:
        batch_parser = model.module.parse_batch
    else:
        batch_parser = model.parse_batch

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = batch_parser(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_loss = reduce_tensor(loss.data, n_gpus).item() \
                if hparams.distributed_run else loss.item()

            if hparams.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if hparams.fp16_run else False

            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
                reduced_val_loss = validate(model, criterion, valset, iteration,
                                            hparams.batch_size, n_gpus, collate_fn,
                                            logger, hparams.distributed_run, rank)
                if rank == 0:
                    print("Validation loss {}: {:9f}".format(
                        iteration, reduced_val_loss))
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
                    logger.log_validation(reduced_val_loss, model, x, y, y_pred,
                                          iteration, hparams)

            iteration += 1
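# The commented-out schedule in the first Tacotron2 loop above
# (`lr = args.lr * (0.1 ** (epoch // 30))`) can be applied through
# optimizer.param_groups exactly the way these loops already do.
# A minimal sketch, assuming a base learning rate and a 10x decay every 30 epochs
# (the helper name and defaults are illustrative, not from the original code):
def step_decay_lr(base_lr, epoch, decay_rate=0.1, decay_every=30):
    return base_lr * (decay_rate ** (epoch // decay_every))

# Inside the epoch loop:
# learning_rate = step_decay_lr(hparams.learning_rate, epoch)
# for param_group in optimizer.param_groups:
#     param_group['lr'] = learning_rate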
def train(cfg, local_rank, distributed, random_number_generator=None):
    if hasattr(torch._C, '_jit_set_profiling_executor'):
        torch._C._jit_set_profiling_executor(False)
    if hasattr(torch._C, '_jit_set_profiling_mode'):
        torch._C._jit_set_profiling_mode(False)

    # Model logging
    log_event(key=constants.GLOBAL_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    log_event(key=constants.NUM_IMAGE_CANDIDATES,
              value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Initialize mixed-precision training
    is_fp16 = (cfg.DTYPE == "float16")
    if is_fp16:
        # convert model to FP16
        model.half()

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    log_event(key=constants.OPT_NAME, value="sgd_with_momentum")
    log_event(key=constants.OPT_BASE_LR, value=cfg.SOLVER.BASE_LR)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=cfg.SOLVER.WARMUP_ITERS)
    log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=cfg.SOLVER.WARMUP_FACTOR)
    log_event(key=constants.OPT_LR_DECAY_FACTOR, value=cfg.SOLVER.GAMMA)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=cfg.SOLVER.STEPS)
    log_event(key=constants.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN[0])
    log_event(key=constants.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)

    scheduler = make_lr_scheduler(cfg, optimizer)

    # disable the garbage collection
    gc.disable()

    if distributed:
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0
    arguments["nhwc"] = cfg.NHWC
    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
    arguments.update(extra_checkpoint_data)

    if is_fp16:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    log_end(key=constants.INIT_STOP)
    barrier()
    log_start(key=constants.RUN_START)
    barrier()

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        random_number_generator=random_number_generator,
    )
    log_event(key=constants.TRAIN_SAMPLES, value=len(data_loader))

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    success = do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        cfg.DISABLE_REDUCED_LOGGING,
        per_iter_start_callback_fn=functools.partial(
            mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT={:.4f} iterations / s".format(
        (arguments["iteration"] * cfg.SOLVER.IMS_PER_BATCH) / total_training_time))

    return model, success
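# `barrier()` is called above around the MLPerf INIT_STOP/RUN_START markers but is
# not defined in this snippet. A typical implementation (an assumption) synchronizes
# all ranks only when torch.distributed has actually been initialized:
import torch.distributed as dist

def barrier():
    # No-op in single-process runs, global sync point in distributed runs.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()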
def style(model, style_image, content_image, iterations):
    n_iter = [0]
    t0 = time.time()
    style_image = prep(style_image, False)
    content_image = prep(content_image)
    targets = compute_targets(model, style_image, content_image)
    loss_layers = model.style_layers + model.content_layers

    if args.optimizer.lower() == 'adam':
        optimizer = optim.Adam([content_image], lr=args.lr, eps=args.eps,
                               betas=(args.beta1, 0.999))
    elif args.optimizer.lower() == 'sgd':
        optimizer = torch.optim.SGD([content_image], lr=args.lr, momentum=0.99999)
    else:
        optimizer = optim.LBFGS([content_image], lr=args.lr,
                                history_size=args.history_size,
                                tolerance_grad=args.tolerance_grad,
                                tolerance_change=args.tolerance_change,
                                max_iter=args.max_iter, max_eval=args.max_eval)
    if args.half:
        optimizer = FP16_Optimizer(optimizer, scale=args.static_loss_scale,
                                   dynamic_scale=args.dynamic_loss_scale)

    def closure():
        optimizer.zero_grad()
        out = model(content_image, loss_layers)
        layer_losses = [weights[a] * loss_fns[a](A.float(), targets[a])
                        for a, A in enumerate(out)]
        loss = sum(layer_losses)
        if args.half:
            optimizer.backward(loss)
        else:
            loss.backward()
        # print(content_image.grad.data)
        # print(optimizer.fp32_params[0].grad.data)
        # quit()
        n_iter[0] += 1
        if n_iter[0] % args.log_interval == 1:
            print('Iteration: %d, loss: %d, time: %s' %
                  (n_iter[0], int(loss.item()), time.time() - t0))
            # print([loss_layers[li] + ': ' + str(l.item()) for li, l in enumerate(layer_losses)])  # loss of each layer
        return loss

    while n_iter[0] <= iterations:
        optimizer.step(closure)
        if args.save_interval > 0 and n_iter[0] % args.save_interval == 0 and n_iter[0] > 0:
            postp(content_image.data[0].float().cpu().squeeze()).save(
                outfile + '-i' + str(n_iter[0]), format='JPEG',
                subsampling=0, quality=60)

    return postp(content_image.data[0].float().cpu().squeeze())
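# `compute_targets`, `weights`, and `loss_fns` are not defined in this snippet. In
# Gatys-style transfer they usually pair Gram-matrix targets for the style layers
# with raw feature targets for the content layers. A minimal sketch of that
# convention (an assumption, not necessarily this repo's implementation):
import torch
import torch.nn.functional as F

def gram_matrix(feat):
    # feat: (batch, channels, height, width) feature map from a VGG layer.
    b, c, h, w = feat.size()
    flat = feat.view(b, c, h * w)
    return torch.bmm(flat, flat.transpose(1, 2)) / (c * h * w)

def gram_mse_loss(pred_feat, target_gram):
    # Style loss: MSE between the Gram matrix of the prediction and the target Gram.
    return F.mse_loss(gram_matrix(pred_feat), target_gram)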
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hyper_params, train_loader, valset, collate_fn):
    """Training and validation method with logging results to tensorboard and stdout

    :param output_directory (string): directory to save checkpoints
    :param log_directory (string): directory to save tensorboard logs
    :param checkpoint_path (string): checkpoint path
    :param n_gpus (int): number of gpus
    :param rank (int): rank of current gpu
    :param hyper_params (dictionary): dictionary with all hyper parameters
    """
    # Check whether this is a distributed run
    if hyper_params['distributed_run']:
        init_distributed(hyper_params, n_gpus, rank, group_name)

    # Set the same fixed seed to reproduce the same results every time we train
    torch.manual_seed(hyper_params['seed'])
    torch.cuda.manual_seed(hyper_params['seed'])

    model = load_model(hyper_params)
    learning_rate = hyper_params['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hyper_params['weight_decay'])
    if hyper_params['fp16_run']:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hyper_params['dynamic_loss_scaling'])

    # Define the criterion of the loss function. The objective.
    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)
    # logger = ''

    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            # Restart the model from the last checkpoint if we saved the parameters
            # and don't want to start from 0
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hyper_params['use_saved_learning_rate']:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    # Make all modules and regularization aware that this is the training stage
    model.train()

    # MAIN LOOP
    for epoch in range(epoch_offset, hyper_params['epochs']):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            input_data, output_target = model.parse_batch(batch)
            output_predicted = model(input_data)
            loss = criterion(output_predicted, output_target)
            if hyper_params['distributed_run']:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hyper_params['fp16_run']:
                # Backward through the fp16 optimizer wrapper
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(
                    hyper_params['grad_clip_thresh'])
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hyper_params['grad_clip_thresh'])

            # Performs a single optimization step (parameter update)
            optimizer.step()

            # This boolean flags overflow when running with the fp16 optimizer
            overflow = optimizer.overflow if hyper_params['fp16_run'] else False

            # Skip logging if the step overflowed or the loss is NaN
            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                # Log training information of the current iteration
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            # Every iters_per_checkpoint steps, validate the model with its updated parameters
            if not overflow and (iteration % hyper_params['iters_per_checkpoint'] == 0):
                validate(model, criterion, valset, iteration,
                         hyper_params['batch_size'], n_gpus, collate_fn, logger,
                         hyper_params['distributed_run'], rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
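# `save_checkpoint` / `load_checkpoint` are used by every training loop above but are
# not defined in this snippet. A minimal sketch matching their call signatures
# (assumed from how they are invoked, with illustrative dictionary keys):
import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)

def load_checkpoint(checkpoint_path, model, optimizer):
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    learning_rate = checkpoint_dict['learning_rate']
    iteration = checkpoint_dict['iteration']
    return model, optimizer, learning_rate, iteration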