Example No. 1
def main():
    if not os.path.isdir(cfg.CKPT):
        mkdir_p(cfg.CKPT)
    if args.cfg_file is not None:
        shutil.copyfile(args.cfg_file, os.path.join(cfg.CKPT, args.cfg_file.split('/')[-1]))
    assert_and_infer_cfg(make_immutable=False)

    # Create model
    model = Generalized_RCNN()
    logging_rank(model, distributed=args.distributed, local_rank=args.local_rank)

    # Create checkpointer
    checkpointer = CheckPointer(cfg.CKPT, weights_path=cfg.TRAIN.WEIGHTS, auto_resume=cfg.TRAIN.AUTO_RESUME,
                                local_rank=args.local_rank)

    # Load model or random-initialization
    model = checkpointer.load_model(model, convert_conv1=cfg.MODEL.CONV1_RGB2BGR)
    if cfg.MODEL.BATCH_NORM == 'freeze':
        model = convert_bn2affine_model(model, merge=not checkpointer.resume)
    elif cfg.MODEL.BATCH_NORM == 'sync':
        model = convert_bn2syncbn_model(model)
    model.to(args.device)

    # Create optimizer
    optimizer = Optimizer(model, cfg.SOLVER, local_rank=args.local_rank).build()
    optimizer = checkpointer.load_optimizer(optimizer)
    logging_rank('The mismatch keys: {}'.format(mismatch_params_filter(sorted(checkpointer.mismatch_keys))),
                 distributed=args.distributed, local_rank=args.local_rank)

    # Create scheduler
    scheduler = LearningRateScheduler(optimizer, cfg.SOLVER, start_iter=0, local_rank=args.local_rank)
    scheduler = checkpointer.load_scheduler(scheduler)

    # Create training dataset and loader
    datasets = build_dataset(cfg.TRAIN.DATASETS, is_train=True, local_rank=args.local_rank)
    train_loader = make_train_data_loader(datasets, is_distributed=args.distributed, start_iter=scheduler.iteration)

    # Model Distributed
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
        )
    else:
        model = torch.nn.DataParallel(model)

    # Build hooks
    all_hooks = build_train_hooks(cfg, optimizer, scheduler, max_iter=cfg.SOLVER.MAX_ITER,
                                  warmup_iter=cfg.SOLVER.WARM_UP_ITERS, ignore_warmup_time=False)

    # Train
    train(model, train_loader, optimizer, scheduler, checkpointer, all_hooks)
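
Example 1 reads cfg and args from the surrounding project, so the snippet is not runnable on its own. The sketch below shows one plausible argparse setup for it; the flag names and derived fields are assumptions inferred from how args is used above, not the project's actual command-line interface.

import argparse
import os

import torch


def parse_args():
    # Hypothetical CLI matching the attributes main() reads from `args`.
    parser = argparse.ArgumentParser(description="Train a Generalized_RCNN model")
    parser.add_argument("--cfg_file", default=None, help="path to a YAML config file")
    parser.add_argument("--local_rank", type=int, default=0,
                        help="process rank set by the distributed launcher")
    args = parser.parse_args()
    # Derived fields used by main(); single process unless a launcher sets WORLD_SIZE > 1.
    args.distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1
    args.device = (torch.device("cuda", args.local_rank)
                   if torch.cuda.is_available() else torch.device("cpu"))
    return args
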
Example No. 2
def train(args):

    os.makedirs(args.checkpoint_dir, exist_ok=True)
    logging = GetLogging(args.logfile)

    train_dataset = CustomerDataset(args.input,
                                    upsample_factor=hop_length,
                                    local_condition=True,
                                    global_condition=False)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    generator, discriminator = create_model(args)

    print(generator)
    print(discriminator)

    num_gpu = torch.cuda.device_count() if args.use_cuda else 1

    global_step = 0

    g_parameters = list(generator.parameters())
    g_optimizer = optim.Adam(g_parameters, lr=args.g_learning_rate)

    d_parameters = list(discriminator.parameters())
    d_optimizer = optim.Adam(d_parameters, lr=args.d_learning_rate)

    writer = Writer(args.checkpoint_dir, sample_rate=sample_rate)

    generator.to(device)
    discriminator.to(device)

    if args.resume is not None:
        restore_step = attempt_to_restore(generator, discriminator,
                                          g_optimizer, d_optimizer,
                                          args.resume, args.use_cuda, logging)
        global_step = restore_step

    customer_g_optimizer = Optimizer(g_optimizer, args.g_learning_rate,
                                     global_step, args.warmup_steps,
                                     args.decay_learning_rate)
    customer_d_optimizer = Optimizer(d_optimizer, args.d_learning_rate,
                                     global_step, args.warmup_steps,
                                     args.decay_learning_rate)

    criterion = nn.MSELoss().to(device)
    stft_criterion = MultiResolutionSTFTLoss()

    for epoch in range(args.epochs):

        collate = CustomerCollate(upsample_factor=hop_length,
                                  condition_window=args.condition_window,
                                  local_condition=True,
                                  global_condition=False)

        train_data_loader = DataLoader(train_dataset,
                                       collate_fn=collate,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       shuffle=True,
                                       pin_memory=True)

        #train one epoch
        for batch, (samples, conditions) in enumerate(train_data_loader):

            start = time.time()
            batch_size = int(conditions.shape[0] // num_gpu * num_gpu)

            samples = samples[:batch_size, :].to(device)
            conditions = conditions[:batch_size, :, :].to(device)

            losses = {}

            if num_gpu > 1:
                g_outputs = parallel(generator, (conditions, ))
            else:
                g_outputs = generator(conditions)

            sc_loss, mag_loss = stft_criterion(g_outputs.squeeze(1),
                                               samples.squeeze(1))

            g_loss = sc_loss + mag_loss

            losses['sc_loss'] = sc_loss.item()
            losses['mag_loss'] = mag_loss.item()
            losses['g_loss'] = g_loss.item()

            customer_g_optimizer.zero_grad()
            g_loss.backward()
            nn.utils.clip_grad_norm_(g_parameters, max_norm=0.5)
            customer_g_optimizer.step_and_update_lr()

            time_used = time.time() - start

            logging.info(
                "Step: {} --sc_loss: {:.3f} --mag_loss: {:.3f} --Time: {:.2f} seconds"
                .format(global_step, sc_loss, mag_loss, time_used))

            if global_step % args.checkpoint_step == 0:
                save_checkpoint(args, generator, discriminator, g_optimizer,
                                d_optimizer, global_step, logging)

            if global_step % args.summary_step == 0:
                writer.logging_loss(losses, global_step)
                target = samples.cpu().detach()[0, 0].numpy()
                predict = g_outputs.cpu().detach()[0, 0].numpy()
                writer.logging_audio(target, predict, global_step)
                writer.logging_histogram(generator, global_step)
                writer.logging_histogram(discriminator, global_step)

            global_step += 1
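
Examples 2-4 and 7 step their models through a project-specific Optimizer wrapper that is constructed as Optimizer(optimizer, base_lr, global_step, warmup_steps, decay_learning_rate) and exposes zero_grad(), step_and_update_lr(), and an lr attribute. Its implementation is not shown here; the sketch below is a hypothetical wrapper with linear warmup followed by exponential decay, included only to make the calling pattern concrete.

class Optimizer(object):
    """Hypothetical warmup/decay wrapper matching the calls used in these examples."""

    def __init__(self, optimizer, base_lr, current_step, warmup_steps, decay_learning_rate):
        self.optimizer = optimizer              # a torch.optim optimizer, e.g. Adam
        self.base_lr = base_lr
        self.current_step = current_step
        self.warmup_steps = warmup_steps
        self.decay = decay_learning_rate        # assumed: per-step multiplicative factor (1.0 disables decay)
        self.lr = base_lr

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step_and_update_lr(self):
        self.optimizer.step()
        self.current_step += 1
        if self.warmup_steps and self.current_step < self.warmup_steps:
            # Linear warmup from 0 to base_lr.
            self.lr = self.base_lr * self.current_step / self.warmup_steps
        else:
            # Exponential decay after warmup (assumed; the real schedule may differ).
            self.lr = self.base_lr * self.decay ** max(0, self.current_step - self.warmup_steps)
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr
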
Example No. 3
def train(args):

    os.makedirs(args.checkpoint_dir, exist_ok=True)
    os.makedirs(args.ema_checkpoint_dir, exist_ok=True)

    train_dataset = CustomerDataset(args.input,
                                    upsample_factor=hop_length,
                                    local_condition=True,
                                    global_condition=False)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    generator, discriminator = create_model(args)

    print(generator)
    print(discriminator)

    num_gpu = torch.cuda.device_count() if args.use_cuda else 1

    global_step = 0

    g_parameters = list(generator.parameters())
    g_optimizer = optim.Adam(g_parameters, lr=args.g_learning_rate)

    d_parameters = list(discriminator.parameters())
    d_optimizer = optim.Adam(d_parameters, lr=args.d_learning_rate)

    writer = SummaryWriter(args.checkpoint_dir)

    generator.to(device)
    discriminator.to(device)

    if args.resume is not None:
        restore_step = attempt_to_restore(generator, discriminator,
                                          g_optimizer, d_optimizer,
                                          args.resume, args.use_cuda)
        global_step = restore_step

    ema = ExponentialMovingAverage(args.ema_decay)
    register_model_to_ema(generator, ema)

    customer_g_optimizer = Optimizer(g_optimizer, args.g_learning_rate,
                                     global_step, args.warmup_steps,
                                     args.decay_learning_rate)
    customer_d_optimizer = Optimizer(d_optimizer, args.d_learning_rate,
                                     global_step, args.warmup_steps,
                                     args.decay_learning_rate)

    criterion = nn.MSELoss().to(device)

    for epoch in range(args.epochs):

        collate = CustomerCollate(upsample_factor=hop_length,
                                  condition_window=args.condition_window,
                                  local_condition=True,
                                  global_condition=False)

        train_data_loader = DataLoader(train_dataset,
                                       collate_fn=collate,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       shuffle=True,
                                       pin_memory=True)

        #train one epoch
        for batch, (samples, conditions) in enumerate(train_data_loader):

            start = time.time()
            batch_size = int(conditions.shape[0] // num_gpu * num_gpu)

            samples = samples[:batch_size, :].to(device)
            conditions = conditions[:batch_size, :, :].to(device)
            z = torch.randn(batch_size, args.z_dim).to(device)

            #train generator
            if num_gpu > 1:
                g_outputs = parallel(generator, (conditions, z))
                _, fake_outputs, real_features, fake_features = \
                   parallel(discriminator, (samples, g_outputs, conditions))
            else:
                g_outputs = generator(conditions, z)
                _, fake_outputs, real_features, fake_features = \
                   discriminator(samples, g_outputs, conditions)

            g_d_loss = []
            for fake_output in fake_outputs:
                target = torch.ones_like(fake_output).to(device)
                g_d_loss.append(criterion(fake_output, target))
            feature_loss = feature_loss_calculate(real_features, fake_features)
            g_loss = feature_loss * args.lamda + sum(g_d_loss)

            customer_g_optimizer.zero_grad()
            g_loss.backward()
            nn.utils.clip_grad_norm_(g_parameters, max_norm=0.5)
            customer_g_optimizer.step_and_update_lr()

            #train discriminator
            g_outputs = g_outputs.detach()
            if num_gpu > 1:
                real_outputs, fake_outputs, _, _ = \
                    parallel(discriminator, (samples, g_outputs, conditions))
            else:
                real_outputs, fake_outputs, _, _ = \
                    discriminator(samples, g_outputs, conditions)

            fake_loss, real_loss = [], []
            for (fake_output, real_output) in zip(fake_outputs, real_outputs):
                fake_target = torch.zeros_like(fake_output).to(device)
                real_target = torch.ones_like(real_output).to(device)
                fake_loss.append(criterion(fake_output, fake_target))
                real_loss.append(criterion(real_output, real_target))
            d_loss = sum(fake_loss) + sum(real_loss)

            customer_d_optimizer.zero_grad()
            d_loss.backward()
            nn.utils.clip_grad_norm_(d_parameters, max_norm=0.5)
            customer_d_optimizer.step_and_update_lr()

            global_step += 1

            print(
                "Step: {} --g_loss: {:.3f} --d_loss: {:.3f} --Time: {:.2f} seconds"
                .format(global_step, g_loss, d_loss,
                        float(time.time() - start)))
            print(feature_loss.item(), sum(g_d_loss).item(), d_loss.item())
            if ema is not None:
                apply_moving_average(generator, ema)

            if global_step % args.checkpoint_step == 0:
                save_checkpoint(args, generator, discriminator, g_optimizer,
                                d_optimizer, global_step, ema)

            if global_step % args.summary_step == 0:
                writer.add_scalar("g_loss", g_loss.item(), global_step)
                writer.add_scalar("d_loss", d_loss.item(), global_step)
Example No. 4
def train(args):

    os.makedirs(args.checkpoint_dir, exist_ok=True)

    train_dataset = CustomerDataset(
         args.input,
         upsample_factor=hop_length,
         local_condition=True,
         global_condition=False)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    generator, discriminator = create_model(args)

    print(generator)
    print(discriminator)

    num_gpu = torch.cuda.device_count() if args.use_cuda else 1

    global_step = 0

    g_parameters = list(generator.parameters())
    g_optimizer = optim.Adam(g_parameters, lr=args.g_learning_rate)

    d_parameters = list(discriminator.parameters())
    d_optimizer = optim.Adam(d_parameters, lr=args.d_learning_rate)
    
    writer = SummaryWriter(args.checkpoint_dir)

    generator.to(device)
    discriminator.to(device)

    if args.resume is not None:
        restore_step = attempt_to_restore(generator, discriminator, g_optimizer,
                                          d_optimizer, args.resume, args.use_cuda)
        global_step = restore_step

    customer_g_optimizer = Optimizer(g_optimizer, args.g_learning_rate,
                global_step, args.warmup_steps, args.decay_learning_rate)
    customer_d_optimizer = Optimizer(d_optimizer, args.d_learning_rate,
                global_step, args.warmup_steps, args.decay_learning_rate)

    stft_criterion = MultiResolutionSTFTLoss().to(device)
    criterion = nn.MSELoss().to(device)

    for epoch in range(args.epochs):

        collate = CustomerCollate(
            upsample_factor=hop_length,
            condition_window=args.condition_window,
            local_condition=True,
            global_condition=False)

        train_data_loader = DataLoader(train_dataset, collate_fn=collate,
                                       batch_size=args.batch_size, num_workers=args.num_workers,
                                       shuffle=True, pin_memory=True)

        # train one epoch
        for batch, (samples, conditions) in enumerate(train_data_loader):

            start = time.time()
            batch_size = int(conditions.shape[0] // num_gpu * num_gpu)

            samples = samples[:batch_size, :].to(device)
            conditions = conditions[:batch_size, :, :].to(device)
            z = torch.randn(batch_size, args.z_dim).to(device)

            losses = {}

            if num_gpu > 1:
                g_outputs = parallel(generator, (conditions, z))
            else:
                g_outputs = generator(conditions, z)

            #train discriminator
            if global_step > args.discriminator_train_start_steps:
                if num_gpu > 1:
                    real_outputs, fake_outputs = \
                        parallel(discriminator, (samples, g_outputs.detach(), conditions))
                else:
                    real_outputs, fake_outputs = \
                        discriminator(samples, g_outputs.detach(), conditions)

                fake_loss, real_loss = [], []
                for (fake_output, real_output) in zip(fake_outputs, real_outputs):
                    fake_loss.append(criterion(fake_output, torch.zeros_like(fake_output)))
                    real_loss.append(criterion(real_output, torch.ones_like(real_output)))
                #fake_loss = sum(fake_loss) / 10.0
                #real_loss = sum(real_loss) / 10.0
                fake_loss = sum(fake_loss)
                real_loss = sum(real_loss)

                d_loss = fake_loss + real_loss

                customer_d_optimizer.zero_grad()
                d_loss.backward()
                nn.utils.clip_grad_norm_(d_parameters, max_norm=0.5)
                customer_d_optimizer.step_and_update_lr()
            else:
                d_loss = torch.Tensor([0])
                fake_loss = torch.Tensor([0])
                real_loss = torch.Tensor([0])

            losses['fake_loss'] = fake_loss.item()
            losses['real_loss'] = real_loss.item()
            losses['d_loss'] = d_loss.item()

            #train generator
            if num_gpu > 1:
                _, fake_outputs = parallel(discriminator, (samples, g_outputs, conditions))
            else:
                _, fake_outputs = discriminator(samples, g_outputs, conditions)

            adv_loss = []
            for fake_output in fake_outputs:
               adv_loss.append(criterion(fake_output, torch.ones_like(fake_output)))

            #adv_loss = sum(adv_loss) / 10.0
            adv_loss = sum(adv_loss)

            sc_loss, mag_loss = stft_criterion(g_outputs.squeeze(1), samples.squeeze(1))

            if global_step > args.discriminator_train_start_steps:
               g_loss = adv_loss * args.lamda_adv + sc_loss + mag_loss 
            else:
               g_loss = sc_loss + mag_loss

            losses['adv_loss'] = adv_loss.item()
            losses['sc_loss'] = sc_loss.item()
            losses['mag_loss'] = mag_loss.item()
            losses['g_loss'] = g_loss.item()
 
            customer_g_optimizer.zero_grad()
            g_loss.backward()
            nn.utils.clip_grad_norm_(g_parameters, max_norm=0.5)
            customer_g_optimizer.step_and_update_lr()

            time_used = time.time() - start
            if global_step > args.discriminator_train_start_steps:
                print("Step: {} --adv_loss: {:.3f} --real_loss: {:.3f} --fake_loss: {:.3f} --sc_loss: {:.3f} --mag_loss: {:.3f} --Time: {:.2f} seconds".format(
                   global_step, adv_loss, real_loss, fake_loss, sc_loss, mag_loss, time_used))
            else:
                print("Step: {} --sc_loss: {:.3f} --mag_loss: {:.3f} --Time: {:.2f} seconds".format(global_step, sc_loss, mag_loss, time_used))

            global_step += 1

            if global_step % args.checkpoint_step == 0:
                save_checkpoint(args, generator, discriminator,
                         g_optimizer, d_optimizer, global_step)
                
            if global_step % args.summary_step == 0:
                for key in losses:
                    writer.add_scalar('{}'.format(key), losses[key], global_step)
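
Examples 2 and 4 compute sc_loss and mag_loss with a MultiResolutionSTFTLoss that is not defined in these snippets. The condensed sketch below follows the usual Parallel WaveGAN-style formulation (spectral convergence plus log-magnitude L1, averaged over several STFT resolutions); the FFT sizes, hop sizes, and window lengths are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F


def stft_magnitude(x, fft_size, hop_size, win_length):
    # x: (batch, samples); returns magnitude spectrogram clamped away from zero.
    window = torch.hann_window(win_length, device=x.device)
    spec = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
    return spec.abs().clamp(min=1e-7)


class MultiResolutionSTFTLoss(nn.Module):
    """Condensed multi-resolution STFT loss sketch; resolution settings are assumptions."""

    def __init__(self, resolutions=((1024, 256, 1024), (2048, 512, 2048), (512, 128, 512))):
        super().__init__()
        self.resolutions = resolutions  # (fft_size, hop_size, win_length) triples

    def forward(self, predict, target):
        sc_loss, mag_loss = 0.0, 0.0
        for fft_size, hop_size, win_length in self.resolutions:
            p_mag = stft_magnitude(predict, fft_size, hop_size, win_length)
            t_mag = stft_magnitude(target, fft_size, hop_size, win_length)
            # Spectral convergence: relative Frobenius-norm error of the magnitudes.
            sc_loss = sc_loss + torch.norm(t_mag - p_mag, p="fro") / torch.norm(t_mag, p="fro")
            # Log STFT magnitude loss.
            mag_loss = mag_loss + F.l1_loss(torch.log(p_mag), torch.log(t_mag))
        n = len(self.resolutions)
        return sc_loss / n, mag_loss / n
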
Example No. 5
    def __init__(self,
                 batch_size=32,
                 optimizer_name="Adam",
                 lr=1e-3,
                 weight_decay=1e-5,
                 epochs=200,
                 model_name="model01",
                 gpu_ids=None,
                 resume=None,
                 tqdm=None,
                 is_develop=False):
        """
        args:
            batch_size = (int) batch_size of training and validation
            lr = (float) learning rate of optimization
            weight_decay = (float) weight decay of optimization
            epochs = (int) The number of epochs of training
            model_name = (string) The name of training model. Will be folder name.
            gpu_ids = (List) List of gpu_ids. (e.g. gpu_ids = [0, 1]). Use CPU, if it is None. 
            resume = (Dict) Dict of some settings. (resume = {"checkpoint_path":PATH_of_checkpoint, "fine_tuning":True or False}). 
                     Learn from scratch, if it is None.
            tqdm = (tqdm Object) progress bar object. Set your tqdm please.
                   Don't view progress bar, if it is None.
        """
        # Set params
        self.batch_size = batch_size
        self.epochs = epochs
        self.start_epoch = 0
        self.use_cuda = (gpu_ids is not None) and torch.cuda.is_available()
        self.tqdm = tqdm
        self.use_tqdm = tqdm is not None
        # Define Utils. (No need to Change.)
        """
        These are project modules.
        You should not need to change them.
        
        Saver: Save model weight. / <utils.saver.Saver()>
        TensorboardSummary: Write tensorboard file. / <utils.summaries.TensorboardSummary()>
        Evaluator: Calculate some metrics (e.g. Accuracy). / <utils.metrics.Evaluator()>
        """
        ## ***Define Saver***
        self.saver = Saver(model_name, lr, epochs)
        self.saver.save_experiment_config()

        ## ***Define Tensorboard Summary***
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # ------------------------- #
        # Define Training components. (You have to Change!)
        """
        These are the important settings for training.
        You have to change them.
        
        make_data_loader: This creates some <Dataloader>s. / <dataloader.__init__>
        Modeling: You have to define your Model. / <modeling.modeling.Modeling()>
        Evaluator: You have to define Evaluator. / <utils.metrics.Evaluator()>
        Optimizer: You have to define Optimizer. / <utils.optimizer.Optimizer()>
        Loss: You have to define Loss function. / <utils.loss.Loss()>
        """
        ## ***Define Dataloader***
        self.train_loader, self.val_loader, self.test_loader, self.num_classes = make_data_loader(
            batch_size, is_develop=is_develop)

        ## ***Define Your Model***
        self.model = Modeling(self.num_classes)

        ## ***Define Evaluator***
        self.evaluator = Evaluator(self.num_classes)

        ## ***Define Optimizer***
        self.optimizer = Optimizer(self.model.parameters(),
                                   optimizer_name=optimizer_name,
                                   lr=lr,
                                   weight_decay=weight_decay)

        ## ***Define Loss***
        self.criterion = SegmentationLosses(
            weight=torch.tensor([1.0, 1594.0]).cuda()).build_loss('ce')
        # self.criterion = SegmentationLosses().build_loss('focal')
        #  self.criterion = BCEDiceLoss()
        # ------------------------- #
        # Some settings
        """
        You don't have to touch the code below.
        
        Using cuda: Enable to use cuda if you want.
        Resuming checkpoint: You can resume training if you want.
        """
        ## ***Using cuda***
        if self.use_cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=gpu_ids).cuda()

        ## ***Resuming checkpoint***
        """You can ignore bellow code."""
        self.best_pred = 0.0
        if resume is not None:
            if not os.path.isfile(resume["checkpoint_path"]):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    resume["checkpoint_path"]))
            checkpoint = torch.load(resume["checkpoint_path"])
            self.start_epoch = checkpoint['epoch']
            if self.use_cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if resume["fine_tuning"]:
                # When fine-tuning, restore the optimizer state and restart the epoch count.
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.start_epoch = 0
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                resume["checkpoint_path"], checkpoint['epoch']))
Example No. 6
class Trainer(object):
    def __init__(self,
                 batch_size=32,
                 optimizer_name="Adam",
                 lr=1e-3,
                 weight_decay=1e-5,
                 epochs=200,
                 model_name="model01",
                 gpu_ids=None,
                 resume=None,
                 tqdm=None,
                 is_develop=False):
        """
        args:
            batch_size = (int) batch_size of training and validation
            lr = (float) learning rate of optimization
            weight_decay = (float) weight decay of optimization
            epochs = (int) The number of epochs of training
            model_name = (string) The name of training model. Will be folder name.
            gpu_ids = (List) List of gpu_ids. (e.g. gpu_ids = [0, 1]). Use CPU, if it is None. 
            resume = (Dict) Dict of some settings. (resume = {"checkpoint_path":PATH_of_checkpoint, "fine_tuning":True or False}). 
                     Learn from scratch, if it is None.
            tqdm = (tqdm Object) progress bar object. Set your tqdm please.
                   Don't view progress bar, if it is None.
        """
        # Set params
        self.batch_size = batch_size
        self.epochs = epochs
        self.start_epoch = 0
        self.use_cuda = (gpu_ids is not None) and torch.cuda.is_available()
        self.tqdm = tqdm
        self.use_tqdm = tqdm is not None
        # Define Utils. (No need to Change.)
        """
        These are project modules.
        You should not need to change them.
        
        Saver: Save model weight. / <utils.saver.Saver()>
        TensorboardSummary: Write tensorboard file. / <utils.summaries.TensorboardSummary()>
        Evaluator: Calculate some metrics (e.g. Accuracy). / <utils.metrics.Evaluator()>
        """
        ## ***Define Saver***
        self.saver = Saver(model_name, lr, epochs)
        self.saver.save_experiment_config()

        ## ***Define Tensorboard Summary***
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # ------------------------- #
        # Define Training components. (You have to Change!)
        """
        These are the important settings for training.
        You have to change them.
        
        make_data_loader: This creates some <Dataloader>s. / <dataloader.__init__>
        Modeling: You have to define your Model. / <modeling.modeling.Modeling()>
        Evaluator: You have to define Evaluator. / <utils.metrics.Evaluator()>
        Optimizer: You have to define Optimizer. / <utils.optimizer.Optimizer()>
        Loss: You have to define Loss function. / <utils.loss.Loss()>
        """
        ## ***Define Dataloader***
        self.train_loader, self.val_loader, self.test_loader, self.num_classes = make_data_loader(
            batch_size, is_develop=is_develop)

        ## ***Define Your Model***
        self.model = Modeling(self.num_classes)

        ## ***Define Evaluator***
        self.evaluator = Evaluator(self.num_classes)

        ## ***Define Optimizer***
        self.optimizer = Optimizer(self.model.parameters(),
                                   optimizer_name=optimizer_name,
                                   lr=lr,
                                   weight_decay=weight_decay)

        ## ***Define Loss***
        self.criterion = SegmentationLosses(
            weight=torch.tensor([1.0, 1594.0]).cuda()).build_loss('ce')
        # self.criterion = SegmentationLosses().build_loss('focal')
        #  self.criterion = BCEDiceLoss()
        # ------------------------- #
        # Some settings
        """
        You don't have to touch the code below.
        
        Using cuda: Enable to use cuda if you want.
        Resuming checkpoint: You can resume training if you want.
        """
        ## ***Using cuda***
        if self.use_cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=gpu_ids).cuda()

        ## ***Resuming checkpoint***
        """You can ignore bellow code."""
        self.best_pred = 0.0
        if resume is not None:
            if not os.path.isfile(resume["checkpoint_path"]):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    resume["checkpoint_path"]))
            checkpoint = torch.load(resume["checkpoint_path"])
            self.start_epoch = checkpoint['epoch']
            if self.use_cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if resume["fine_tuning"]:
                # When fine-tuning, restore the optimizer state and restart the epoch count.
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.start_epoch = 0
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                resume["checkpoint_path"], checkpoint['epoch']))

    def _run_epoch(self,
                   epoch,
                   mode="train",
                   leave_progress=True,
                   use_optuna=False):
        """
        run training or validation 1 epoch.
        You don't have to change almost of this method.
        
        args:
            epoch = (int) How many epochs this time.
            mode = {"train" or "val"}
            leave_progress = {True or False} Can choose whether leave progress bar or not.
            use_optuna = {True or False} Can choose whether use optuna or not.
        
        Change point (if you need):
        - Evaluation: You can change metrics of monitoring.
        - writer.add_scalar: You can change metrics to be saved in tensorboard.
        """
        # ------------------------- #
        leave_progress = leave_progress and not use_optuna
        # Initializing
        epoch_loss = 0.0
        ## Set model mode & tqdm (progress bar; it wraps the dataloader)
        assert (mode == "train") or (
            mode == "val"
        ), "argument 'mode' must be 'train' or 'val', not {}.".format(mode)
        if mode == "train":
            data_loader = self.tqdm(
                self.train_loader,
                leave=leave_progress) if self.use_tqdm else self.train_loader
            self.model.train()
            num_dataset = len(self.train_loader)
        elif mode == "val":
            data_loader = self.tqdm(
                self.val_loader,
                leave=leave_progress) if self.use_tqdm else self.val_loader
            self.model.eval()
            num_dataset = len(self.val_loader)
        ## Reset confusion matrix of evaluator
        self.evaluator.reset()

        # ------------------------- #
        # Run 1 epoch
        for i, sample in enumerate(data_loader):
            ## ***Get Input data***
            inputs, target = sample["input"], sample["label"]
            if self.use_cuda:
                inputs, target = inputs.cuda(), target.cuda()

            ## ***Calculate Loss <Train>***
            if mode == "train":
                self.optimizer.zero_grad()
                output = self.model(inputs)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()
            ## ***Calculate Loss <Validation>***
            elif mode == "val":
                with torch.no_grad():
                    output = self.model(inputs)
                loss = self.criterion(output, target)
            epoch_loss += loss.item()
            ## ***Report results***
            if self.use_tqdm:
                data_loader.set_description('{} loss: {:.3f}'.format(
                    mode, epoch_loss / (i + 1)))
            ## ***Add batch results into evaluator***
            target = target.cpu().numpy()
            output = torch.argmax(output, axis=1).data.cpu().numpy()
            self.evaluator.add_batch(target, output)

        ## **********Evaluate Score**********
        """You can add new metrics! <utils.metrics.Evaluator()>"""
        # Acc = self.evaluator.Accuracy()
        miou = self.evaluator.Mean_Intersection_over_Union()

        if not use_optuna:
            ## ***Save eval into Tensorboard***
            self.writer.add_scalar('{}/loss_epoch'.format(mode),
                                   epoch_loss / (i + 1), epoch)
            # self.writer.add_scalar('{}/Acc'.format(mode), Acc, epoch)
            self.writer.add_scalar('{}/miou'.format(mode), miou, epoch)
            print('Total {} loss: {:.3f}'.format(mode,
                                                 epoch_loss / num_dataset))
            print("{0} mIoU:{1:.2f}".format(mode, miou))

        # Return score to watch. (update checkpoint or optuna's objective)
        return miou

    def run(self, leave_progress=True, use_optuna=False):
        """
        Run all epochs of training and validation.
        """
        for epoch in tqdm(range(self.start_epoch, self.epochs)):
            print(pycolor.GREEN + "[Epoch: {}]".format(epoch) + pycolor.END)

            ## ***Train***
            print(pycolor.YELLOW + "Training:" + pycolor.END)
            self._run_epoch(epoch,
                            mode="train",
                            leave_progress=leave_progress,
                            use_optuna=use_optuna)
            ## ***Validation***
            print(pycolor.YELLOW + "Validation:" + pycolor.END)
            score = self._run_epoch(epoch,
                                    mode="val",
                                    leave_progress=leave_progress,
                                    use_optuna=use_optuna)
            print("---------------------")
            if score > self.best_pred:
                print("model improve best score from {:.4f} to {:.4f}.".format(
                    self.best_pred, score))
                self.best_pred = score
                self.saver.save_checkpoint({
                    'epoch':
                    epoch + 1,
                    'state_dict':
                    self.model.state_dict(),
                    'optimizer':
                    self.optimizer.state_dict(),
                    'best_pred':
                    self.best_pred,
                })
        self.writer.close()
        return self.best_pred
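
Examples 5 and 6 only define the Trainer itself; the snippet below is a minimal usage sketch. The import path is hypothetical, and the resume dict follows the format documented in the constructor.

from tqdm import tqdm

from trainer import Trainer  # hypothetical import path; adjust to where Trainer lives

trainer = Trainer(
    batch_size=16,
    optimizer_name="Adam",
    lr=1e-3,
    weight_decay=1e-5,
    epochs=100,
    model_name="model01",
    gpu_ids=[0],    # None runs on CPU
    resume=None,    # or {"checkpoint_path": "path/to/checkpoint.pth", "fine_tuning": False}
    tqdm=tqdm,      # None disables the progress bar
)
best_miou = trainer.run(leave_progress=True, use_optuna=False)
print("best mIoU: {:.4f}".format(best_miou))
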
Example No. 7
def train(args):

    os.makedirs(args.checkpoint_dir, exist_ok=True)
    os.makedirs(args.ema_checkpoint_dir, exist_ok=True)

    train_dataset = WaveRNNDataset(args.input,
                                   upsample_factor=hop_length,
                                   local_condition=True,
                                   global_condition=False)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    model = create_model(args)

    print(model)

    num_gpu = torch.cuda.device_count() if args.use_cuda else 1

    model.train(mode=True)

    global_step = 0

    parameters = list(model.parameters())
    optimizer = optim.Adam(parameters, lr=args.learning_rate)

    writer = SummaryWriter(args.checkpoint_dir)

    model.to(device)

    if args.resume is not None:
        restore_step = attempt_to_restore(model, optimizer, args.resume,
                                          args.use_cuda)
        global_step = restore_step

    ema = ExponentialMovingAverage(args.ema_decay)
    register_model_to_ema(model, ema)

    customer_optimizer = Optimizer(optimizer, args.learning_rate, global_step,
                                   args.warmup_steps, args.decay_learning_rate)

    criterion = nn.NLLLoss().to(device)

    for epoch in range(args.epochs):

        collate = WaveRNNCollate(upsample_factor=hop_length,
                                 condition_window=args.condition_window,
                                 local_condition=True,
                                 global_condition=False)

        train_data_loader = DataLoader(train_dataset,
                                       collate_fn=collate,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       shuffle=True,
                                       pin_memory=True)

        #train one epoch
        for batch, (coarse, fine, condition) in enumerate(train_data_loader):

            start = time.time()
            batch_size = int(condition.shape[0] // num_gpu * num_gpu)

            coarse = coarse[:batch_size, :].to(device)
            fine = fine[:batch_size, :].to(device)
            condition = condition[:batch_size, :, :].to(device)
            inputs = torch.cat([coarse[:, :-1].unsqueeze(-1),
                                fine[:, :-1].unsqueeze(-1),
                                coarse[:, 1:].unsqueeze(-1)], dim=-1)
            inputs = 2 * inputs.float() / 255 - 1.0

            if num_gpu > 1:
                out_c, out_f, _ = parallel(model, (inputs, condition))
            else:
                out_c, out_f, _ = model(inputs, condition)

            loss_c = criterion(out_c.transpose(1, 2).float(), coarse[:, 1:])
            loss_f = criterion(out_f.transpose(1, 2).float(), fine[:, 1:])
            loss = loss_c + loss_f

            global_step += 1
            customer_optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=0.5)
            customer_optimizer.step_and_update_lr()
            model.after_update()

            if ema is not None:
                apply_moving_average(model, ema)

            print(
                "Step: {} --loss_c: {:.3f} --loss_f: {:.3f} --Lr: {:g} --Time: {:.2f} seconds"
                .format(global_step, loss_c, loss_f, customer_optimizer.lr,
                        float(time.time() - start)))

            if global_step % args.checkpoint_step == 0:
                save_checkpoint(args, model, optimizer, global_step, ema)

            if global_step % args.summary_step == 0:
                writer.add_scalar("loss", loss.item(), global_step)
                writer.add_scalar("loss_c", loss_c.item(), global_step)
                writer.add_scalar("loss_f", loss_f.item(), global_step)