Example #1
    def train(train_instances, dev_instances, model, config, logger):
        train_data = CustomDataset(data=train_instances)

        sampler = RandomSampler(train_data)

        batch_size = config.batch_size
        iterator = trange(config.num_epochs, desc='Epoch', disable=False)
        data_loader = DataLoader(dataset=train_data, sampler=sampler, batch_size=batch_size,
                                 collate_fn=CustomCollate.collate, pin_memory=True, num_workers=1)

        optimizer = RAdam(model.parameters(), lr=config.learning_rate)

        logger.info('***** Start Training *****')
        torch.autograd.set_detect_anomaly(True)
        model.train()
        losses = []
        best_eval_loss = float('inf')
        best_epoch = -1
        best_model = None
        for epoch in iterator:
            logger.info('***** Epoch: {} *****'.format(epoch))
            total_loss = 0.0
            total_items = 0
            for _, batch in enumerate(data_loader):
                batch = batch_to_device(batch, config.device)
                model.to(config.device)
                model.train()
                model.zero_grad()
                output = model(batch)
                logliks = output[LOG_LIKELIHOOD]
                loss = -logliks.sum() / output[BATCH_SIZE]
                loss.backward()
                optimizer.step()
                total_loss += -logliks.sum().item()
                total_items += output[BATCH_SIZE]
            total_loss /= total_items
            losses.append(total_loss)
            logger.info('Train-Loss:{}'.format(total_loss))

            # eval
            eval_result = Evaluator.evaluate(dev_instances, model, config, logger)
            eval_loss = eval_result[TOTAL_LOSS]
            if eval_loss < best_eval_loss:
                logger.info('Update model')
                best_eval_loss = eval_loss
                best_epoch = epoch
                best_model = copy.deepcopy(model)
            else:
                if config.patience < epoch - best_epoch:
                    logger.info('Early stopping, Best Epoch: {}'.format(best_epoch))
                    break
        logger.info('End Training, Best Epoch: {}'.format(best_epoch))
        model_filename = Trainer.save_model(config, best_model, best_epoch)
        return best_model, model_filename
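The examples on this page construct RAdam with only a learning rate (and sometimes a weight decay). For reference, a minimal sketch of the full constructor, assuming the torch_optimizer implementation (recent PyTorch versions also ship torch.optim.RAdam with the same arguments); the model below is a placeholder:

import torch
from torch_optimizer import RAdam

model = torch.nn.Linear(128, 10)   # placeholder for the real model
optimizer = RAdam(
    model.parameters(),
    lr=1e-3,                # target learning rate
    betas=(0.9, 0.999),     # coefficients for the running moment averages
    eps=1e-8,               # numerical stability term
    weight_decay=0,         # L2 penalty
)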
Example #2
 def configure_optimizers(self):
     opt = RAdam(
         self.model.parameters(),
         lr=self.hp.lr,
         weight_decay=self.hp.weight_decay,
     )
     return [opt]
Example #3
 def configure_optimizers(self):
     self.optimizer = RAdam(self.parameters(), lr=self.cfg.train.lr, weight_decay=2e-5)
     warmup_epo = 1
     warmup_factor = 10
     scheduler_cos = CosineAnnealingLR(self.optimizer, T_max=self.cfg.train.epoch - warmup_epo, eta_min=0)
     self.scheduler = GradualWarmupScheduler(self.optimizer, multiplier=warmup_factor,
                                             total_epoch=warmup_epo, after_scheduler=scheduler_cos)
     return [self.optimizer], [self.scheduler]
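Example #3 ramps the learning rate up for one epoch before handing over to cosine annealing. A minimal sketch of that behaviour, assuming the warmup_scheduler package as the source of GradualWarmupScheduler; the model and epoch count are placeholders:

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch_optimizer import RAdam
from warmup_scheduler import GradualWarmupScheduler  # assumed source of the class

model = torch.nn.Linear(16, 4)                        # placeholder model
optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=2e-5)

epochs, warmup_epo, warmup_factor = 10, 1, 10
scheduler_cos = CosineAnnealingLR(optimizer, T_max=epochs - warmup_epo, eta_min=0)
scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor,
                                   total_epoch=warmup_epo, after_scheduler=scheduler_cos)

for epoch in range(epochs):
    # The LR grows from lr towards lr * warmup_factor during the first epoch,
    # then follows the cosine schedule.
    scheduler.step(epoch)
    print(epoch, optimizer.param_groups[0]['lr'])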
Example #4
    def configure_optimizers(self):
        optimizer = {
            "sgd":
            FusedSGD(self.parameters(),
                     lr=self.lr,
                     momentum=self.args.momentum),
            "adam":
            FusedAdam(self.parameters(),
                      lr=self.lr,
                      weight_decay=self.args.weight_decay),
            "adamw":
            torch.optim.AdamW(self.parameters(),
                              lr=self.lr,
                              weight_decay=self.args.weight_decay),
            "radam":
            RAdam(self.parameters(),
                  lr=self.lr,
                  weight_decay=self.args.weight_decay),
            "adabelief":
            AdaBelief(self.parameters(),
                      lr=self.lr,
                      weight_decay=self.args.weight_decay),
            "adabound":
            AdaBound(self.parameters(),
                     lr=self.lr,
                     weight_decay=self.args.weight_decay),
            "adamp":
            AdamP(self.parameters(),
                  lr=self.lr,
                  weight_decay=self.args.weight_decay),
            "novograd":
            FusedNovoGrad(self.parameters(),
                          lr=self.lr,
                          weight_decay=self.args.weight_decay),
        }[self.args.optimizer.lower()]

        if not self.args.use_scheduler:
            return optimizer

        scheduler = {
            "scheduler":
            NoamLR(
                optimizer=optimizer,
                warmup_epochs=self.args.warmup,
                total_epochs=self.args.epochs,
                steps_per_epoch=len(self.train_dataloader()) // self.args.gpus,
                init_lr=self.args.init_lr,
                max_lr=self.args.lr,
                final_lr=self.args.final_lr,
            ),
            "interval":
            "step",
            "frequency":
            1,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler}
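A caveat with the dictionary dispatch above: every optimizer in the dict is instantiated before the lookup picks one. A hedged sketch of a lazier variant that only builds the selected optimizer; the helper name and defaults are illustrative, not part of the original code:

from functools import partial

import torch
from torch_optimizer import RAdam

def build_optimizer(name, params, lr, momentum=0.9, weight_decay=1e-5):
    # Store constructors instead of instances, so only the chosen one is built.
    factories = {
        "sgd": partial(torch.optim.SGD, momentum=momentum),
        "adamw": partial(torch.optim.AdamW, weight_decay=weight_decay),
        "radam": partial(RAdam, weight_decay=weight_decay),
    }
    return factories[name.lower()](params, lr=lr)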
Example #5
    def get(self, params, optimizer_name):
        """
        Creates torch optimizer specified by 'optimizer_name' for given 'params'.

        params: list of torch.nn.parameter.Parameter
        optimizer_name: str
        """
        if optimizer_name == "sgd":
            optimizer = SGD(
                params,
                lr=self.learning_rate,
                momentum=self.momentum,
                weight_decay=self.weight_decay,
            )
        elif optimizer_name == "adam":
            optimizer = Adam(
                params,
                lr=self.learning_rate,
                betas=tuple(self.betas),
                eps=self.eps,
                weight_decay=self.weight_decay,
                amsgrad=self.amsgrad,
            )
        elif optimizer_name == "adabound":
            optimizer = AdaBound(
                params,
                lr=self.learning_rate,
                betas=tuple(self.betas),
                final_lr=self.adabound_final_lr,
                gamma=self.adabound_gamma,
                eps=self.eps,
                weight_decay=self.weight_decay,
                amsbound=self.amsgrad,
            )
        elif optimizer_name == "lamb":
            optimizer = Lamb(
                params,
                lr=self.learning_rate,
                betas=tuple(self.betas),
                eps=self.eps,
                weight_decay=self.weight_decay,
            )
        elif optimizer_name == "radam":
            optimizer = RAdam(
                params,
                lr=self.learning_rate,
                betas=tuple(self.betas),
                eps=self.eps,
                weight_decay=self.weight_decay,
            )
        else:
            raise Exception(
                "Invalid OPTIMIZER, try: 'adam', 'sgd', 'adabound', 'lamb' or 'radam'"
            )
        return optimizer
Example #6
    def configure_optimizers(self):
        if self.args.hpus:
            self.model = self.model.to(get_device(self.args))
            permute_params(self.model, True, self.args.run_lazy_mode)
        # Avoid instantiating optimizers unless we have to,
        # since they might not be supported
        if self.args.optimizer.lower() == 'sgd':
            optimizer = SGD(self.parameters(),
                            lr=self.learning_rate,
                            momentum=self.args.momentum)
        elif self.args.optimizer.lower() == 'adam':
            optimizer = Adam(self.parameters(),
                             lr=self.learning_rate,
                             weight_decay=self.args.weight_decay)
        elif self.args.optimizer.lower() == 'radam':
            optimizer = RAdam(self.parameters(),
                              lr=self.learning_rate,
                              weight_decay=self.args.weight_decay)
        elif self.args.optimizer.lower() == 'adamw':
            optimizer = torch.optim.AdamW(self.parameters(),
                                          lr=self.learning_rate,
                                          weight_decay=self.args.weight_decay)
        elif self.args.optimizer.lower() == 'fusedadamw':
            from habana_frameworks.torch.hpex.optimizers import FusedAdamW
            optimizer = FusedAdamW(self.parameters(),
                                   lr=self.learning_rate,
                                   eps=1e-08,
                                   weight_decay=self.args.weight_decay)
        else:
            assert False, "optimizer {} not supported".format(
                self.args.optimizer.lower())

        scheduler = {
            "none":
            None,
            "multistep":
            torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 self.args.steps,
                                                 gamma=self.args.factor),
            "cosine":
            torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                       self.args.max_epochs),
            "plateau":
            torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=self.args.factor,
                patience=self.args.lr_patience),
        }[self.args.scheduler.lower()]

        opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
        if scheduler is not None:
            opt_dict.update({"lr_scheduler": scheduler})
        return opt_dict
Example #7
    def configure_optimizers(self):
        params = self.model.parameters()

        if self.hp.optim == 'fused_adam':
            from apex.optimizers import FusedAdam
            opt = FusedAdam(
                params,
                lr=self.hp.lr,
                weight_decay=self.hp.weight_decay,
            )
            sched = {
                'scheduler':
                OneCycleLR(
                    opt,
                    max_lr=self.hp.lr,
                    total_steps=self.total_steps,
                ),
                'interval':
                'step',
            }
        elif self.hp.optim == 'radam':
            opt = RAdam(
                params,
                lr=self.hp.lr,
                weight_decay=self.hp.weight_decay,
            )
            # noinspection PyTypeChecker
            sched = {
                'scheduler':
                LambdaLR(
                    opt,
                    lr_lambda=partial(
                        flat_cos,
                        total_steps=self.total_steps,
                    ),
                ),
                'interval':
                'step',
            }
        else:
            raise Exception

        return [opt], [sched]
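Example #7 feeds a flat_cos callable to LambdaLR via functools.partial, but its definition is not shown. A sketch of what a flat-then-cosine multiplier of that shape could look like; the flat fraction is an assumption:

import math

def flat_cos(step, total_steps, flat_frac=0.72):
    """LR multiplier: hold 1.0 for the first part of training, then cosine-decay to 0."""
    flat_steps = int(total_steps * flat_frac)
    if step < flat_steps:
        return 1.0
    progress = (step - flat_steps) / max(1, total_steps - flat_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))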
Example #8
    def configure_optimizers(self):
        optimizer = {
            "sgd":
            FusedSGD(self.parameters(),
                     lr=self.learning_rate,
                     momentum=self.args.momentum),
            "adam":
            FusedAdam(self.parameters(),
                      lr=self.learning_rate,
                      weight_decay=self.args.weight_decay),
            "radam":
            RAdam(self.parameters(),
                  lr=self.learning_rate,
                  weight_decay=self.args.weight_decay),
        }[self.args.optimizer.lower()]

        scheduler = {
            "none":
            None,
            "multistep":
            torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 self.args.steps,
                                                 gamma=self.args.factor),
            "cosine":
            torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                       self.args.max_epochs),
            "plateau":
            torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=self.args.factor,
                patience=self.args.lr_patience),
        }[self.args.scheduler.lower()]

        opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
        if scheduler is not None:
            opt_dict.update({"lr_scheduler": scheduler})
        return opt_dict
Example #9
    def configure_optimizers(self):
        optimizer = Lookahead(
            RAdam(self.parameters(),
                  lr=0.001,
                  weight_decay=WEIGHT_DECAY,
                  eps=1e-5))
        schedule = {
            'scheduler':
            OneCycleLRLookahead(optimizer,
                                max_lr=MAX_LR,
                                epochs=EPOCHS,
                                steps_per_epoch=int(
                                    len(self._trainval[b'filenames']) /
                                    BATCH_SIZE),
                                verbose=False),
            'name':
            'learning_rate',
            'interval':
            'step',
            'frequency':
            1
        }

        return [optimizer], [schedule]
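Example #9 wraps RAdam in Lookahead, the combination commonly known as Ranger. A minimal construction sketch using torch_optimizer's Lookahead; k and alpha are that class's defaults, and the weight decay stands in for the WEIGHT_DECAY constant:

import torch
from torch_optimizer import Lookahead, RAdam

model = torch.nn.Linear(8, 1)                        # placeholder model
base = RAdam(model.parameters(), lr=0.001, weight_decay=1e-2, eps=1e-5)
optimizer = Lookahead(base, k=5, alpha=0.5)          # "Ranger" = Lookahead over RAdam

loss = model(torch.randn(4, 8)).sum()
loss.backward()
optimizer.step()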
Example #10
def main(args: Dict[str, Any]):
    start = time.time()

    # Initialize config
    config_path: str = args["config"]
    with open(config_path, "r", encoding="utf-8") as f:
        config: Dict[str, Any] = yaml.safe_load(f)
    logger.info(f"Loaded config at: {config_path}")
    logger.info(f"{pformat(config)}")


    # Initialize device
    if args["use_gpu"] and torch.cuda.is_available():
        device: torch.device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")


    # Initialize model
    model = nn.DataParallel(Resnet50(
        embedding_size=config["embedding_size"],
        pretrained=config["pretrained"]
    ))
    model = model.to(device)
    logger.info(f"Initialized model: {model}")


    # Initialize optimizer
    optimizer = RAdam(model.parameters(), lr=config["lr"])
    logger.info(f"Initialized optimizer: {optimizer}")


    # Initialize train transforms
    transform_train = T.Compose([
        T.Resize((config["image_size"], config["image_size"])),
        T.RandomHorizontalFlip(),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        T.RandomAffine(degrees=5, scale=(0.8, 1.2), translate=(0.2, 0.2)),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
    ])
    logger.info(f"Initialized training transforms: {transform_train}")


    # Initialize training set
    train_set = Dataset(args["train_dir"], transform=transform_train)

    if args["loss"] == "tripletloss":
        # Initialize train loader for triplet loss
        batch_size: int = config["classes_per_batch"] * config["samples_per_class"]
        train_loader = DataLoader(
            train_set,
            batch_size,
            sampler=PKSampler(
                train_set.targets,
                config["classes_per_batch"],
                config["samples_per_class"]
            ),
            shuffle=False,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")

        # Initialize loss function
        loss_function = TripletMarginLoss(
            margin=config["margin"],
            sampling_type=config["sampling_type"]
        )
        logger.info(f"Initialized training loss: {loss_function}")

    elif args["loss"] == "proxy_nca":
        # Initialize train loader for proxy-nca loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")

        loss_function = ProxyNCALoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            embedding_scale=config["embedding_scale"],
            proxy_scale=config["proxy_scale"],
            smoothing_factor=config["smoothing_factor"],
            device=device
        )

    elif args["loss"] == "proxy_anchor":
        # Initialize train loader for proxy-anchor loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")

        loss_function = ProxyAnchorLoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            margin=config["margin"],
            alpha=config["alpha"],
            device=device
        )

    elif args["loss"] == "soft_triple":
        # Initialize train loader for soft-triple loss
        batch_size: int = config["batch_size"]
        train_loader = DataLoader(
            train_set,
            config["batch_size"],
            shuffle=True,
            num_workers=args["n_workers"],
            pin_memory=True,
        )
        logger.info(f"Initialized train_loader: {train_loader.dataset}")

        loss_function = SoftTripleLoss(
            n_classes=len(train_set.classes),
            embedding_size=config["embedding_size"],
            n_centers_per_class=config["n_centers_per_class"],
            lambda_=config["lambda"],
            gamma=config["gamma"],
            tau=config["tau"],
            margin=config["margin"],
            device=device
        )
    else:
        raise Exception("Only the following losses is supported: "
                        "['tripletloss', 'proxy_nca', 'proxy_anchor', 'soft_triple']. "
                        f"Got {args['loss']}")


    # Initialize test transforms
    transform_test = T.Compose([
        T.Resize((config["image_size"], config["image_size"])),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
    ])
    logger.info(f"Initialized test transforms: {transform_test}")


    # Initialize test set and test loader
    test_dataset = Dataset(args["test_dir"], transform=transform_test)
    test_loader = DataLoader(
        test_dataset, batch_size,
        shuffle=False,
        num_workers=args["n_workers"],
    )
    logger.info(f"Initialized test_loader: {test_loader.dataset}")


    # Initialize reference set and reference loader
    # If reference set is not given, use train set as reference set, but without random sampling
    if not args["reference_dir"]:
        reference_set = Dataset(args["train_dir"], transform=transform_test)
    else:
        reference_set = Dataset(args["reference_dir"], transform=transform_test)
    # Sometimes the reference set is too large to fit into memory,
    # so we only sample a subset of it.
    n_samples_per_reference_class: int = args["n_samples_per_reference_class"]
    if n_samples_per_reference_class > 0:
        reference_set = get_subset_from_dataset(reference_set, n_samples_per_reference_class)

    reference_loader = DataLoader(
        reference_set, batch_size,
        shuffle=False,
        num_workers=args["n_workers"],
    )
    logger.info(f"Initialized reference set: {reference_loader.dataset}")


    # Initialize checkpointing directory
    checkpoint_dir: str = os.path.join(args["checkpoint_root_dir"], CURRENT_TIME)
    writer = SummaryWriter(log_dir=checkpoint_dir)
    logger.info(f"Created checkpoint directory at: {checkpoint_dir}")


    # Dictionary containing all metrics
    output_dict: Dict[str, Any] = {
        "total_epoch": args["n_epochs"],
        "current_epoch": 0,
        "current_iter": 0,
        "metrics": {
            "mean_average_precision": 0.0,
            "average_precision_at_1": 0.0,
            "average_precision_at_5": 0.0,
            "average_precision_at_10": 0.0,
            "top_1_accuracy": 0.0,
            "top_5_accuracy": 0.0,
            "normalized_mutual_information": 0.0,
        }
    }
    # Start training and testing
    logger.info("Start training...")
    for _ in range(1, args["n_epochs"] + 1):
        output_dict = train_one_epoch(
            model, optimizer, loss_function,
            train_loader, test_loader, reference_loader,
            writer, device, config,
            checkpoint_dir,
            args['log_frequency'],
            args['validate_frequency'],
            output_dict
        )
    logger.info(f"DONE TRAINING {args['n_epochs']} epochs")


    # Visualize embeddings
    logger.info("Calculating train embeddings for visualization...")
    log_embeddings_to_tensorboard(train_loader, model, device, writer, tag="train")
    logger.info("Calculating reference embeddings for visualization...")
    log_embeddings_to_tensorboard(reference_loader, model, device, writer, tag="reference")
    logger.info("Calculating test embeddings for visualization...")
    log_embeddings_to_tensorboard(test_loader, model, device, writer, tag="test")


    # Visualize model's graph
    logger.info("Adding graph for visualization")
    with torch.no_grad():
        dummy_input = torch.zeros(1, 3, config["image_size"], config["image_size"]).to(device)
        writer.add_graph(model.module.features, dummy_input)


    # Save all hyper-parameters and corresponding metrics
    logger.info("Saving all hyper-parameters")
    writer.add_hparams(
        config,
        metric_dict={f"hyperparams/{key}": value for key, value in output_dict["metrics"].items()}
    )
    with open(os.path.join(checkpoint_dir, "output_dict.json"), "w") as f:
        json.dump(output_dict, f, indent=4)
    logger.info(f"Dumped output_dict.json at {checkpoint_dir}")


    end = time.time()
    logger.info(f"EVERYTHING IS DONE. Training time: {round(end - start, 2)} seconds")
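The triplet-loss branch above relies on a PKSampler that is not shown. A hedged sketch of that sampling idea (P classes per batch, K samples per class); the class name and the sampling-with-replacement choice are assumptions:

import random
from collections import defaultdict

from torch.utils.data import Sampler

class SimplePKSampler(Sampler):
    """Yield indices so every batch holds p distinct classes with k samples each."""

    def __init__(self, targets, p, k):
        self.p, self.k = p, k
        self.by_class = defaultdict(list)
        for idx, label in enumerate(targets):
            self.by_class[label].append(idx)
        self.num_batches = len(targets) // (p * k)

    def __iter__(self):
        for _ in range(self.num_batches):
            classes = random.sample(list(self.by_class), self.p)
            for c in classes:
                # sample with replacement so small classes still fill the quota
                yield from random.choices(self.by_class[c], k=self.k)

    def __len__(self):
        return self.num_batches * self.p * self.k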
Example #11
def main():
    anchors = [30, 54, 95]
    shuffle = not (args.no_shuffle)
    exp = args.exp
    warm_up_epoch = 3

    # Load and process data

    if args.fold:
        df_train = pd.read_csv(args.data_path +
                               'k_fold/official_train_fold%d.csv' %
                               (args.fold))
        df_val = pd.read_csv(args.data_path +
                             'k_fold/official_val_fold%d.csv' % (args.fold))
    else:
        df_train = pd.read_csv(args.data_path + 'official_train.csv')
        df_val = pd.read_csv(args.data_path + 'official_val.csv')

    train = df_train.image_path.to_list()
    val = df_val.image_path.to_list()
    if exp:
        y_train = df_train.anchor.to_list()
        y_val = df_val.anchor.to_list()
        reg_train_gt = df_train.exp_wind.to_list()
        reg_val_gt = df_val.exp_wind.to_list()
    else:
        y_train = df_train.wind_speed.to_list()
        y_val = df_val.wind_speed.to_list()

    train_transform, val_transform = get_transform(args.image_size)

    train_dataset = WindDataset(image_list=train,
                                target=y_train,
                                exp_target=reg_train_gt if exp else None,
                                transform=train_transform)

    val_dataset = WindDataset(image_list=val,
                              target=y_val,
                              exp_target=reg_val_gt if exp else None,
                              transform=val_transform)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=shuffle,
                              num_workers=args.num_workers,
                              drop_last=True)

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            drop_last=True)

    warm_loader = DataLoader(dataset=train_dataset,
                             batch_size=args.batch_size * 14,
                             shuffle=shuffle,
                             num_workers=args.num_workers,
                             drop_last=True)

    # Load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    last_epoch = 0

    # model = ResNet50_BN_idea()
    if not exp:
        model = Effnet_Wind_B7()
        # model = Effnet_Wind_B5()
    else:
        model = Effnet_Wind_B5_exp_6()
    # model = ResNetExample()
    # if not exp:
    #     model = Seresnext_Wind()
    # else:
    #     model = Seresnext_Wind_Exp()

    # Optimizer
    if args.opt == 'radam':
        optimizer = RAdam(
            model.parameters(),
            lr=args.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=args.weight_decay,
        )
    elif args.opt == 'adamw':
        optimizer = AdamW(model.parameters(), args.lr)

    elif args.opt == 'adam':
        optimizer = Adam(model.parameters(),
                         args.lr,
                         weight_decay=args.weight_decay)
    else:
        optimizer = SGD(model.parameters(),
                        args.lr,
                        momentum=0.9,
                        nesterov=True,
                        weight_decay=args.weight_decay)

    if args.weights:
        # model.load_state_dict(torch.load(args.weights))
        last_epoch = extract_number(args.weights)
        try:
            checkpoint = torch.load(args.weights)
            model.load_state_dict(checkpoint['model_state_dict'])
            if checkpoint['pre_opt'] == args.opt:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                print(optimizer)
        except Exception:  # fall back to a plain state_dict checkpoint
            model.load_state_dict(torch.load(args.weights))
    else:
        model.apply(reset_m_batchnorm)

    model.to(device)

    # Loss function
    if exp:
        criterion = JointLoss2()
    else:
        criterion = RMSELoss()

    # generate log and visualization
    save_path = args.save_path

    log_cache = (args.batch_size, args.image_size, shuffle, exp)

    write_log(args.save_path, model, optimizer, criterion, log_cache)

    plot_dict = {'train': list(), 'val': list()}

    log_train_path = save_path + 'training_log.txt'
    plot_train_path = save_path + 'log.json'

    write_mode = 'w'

    if os.path.exists(log_train_path) and os.path.exists(plot_train_path):
        write_mode = 'a'
        with open(plot_train_path, 'r') as j:
            plot_dict = json.load(j)
            plot_dict['train'] = plot_dict['train'][:last_epoch]
            plot_dict['val'] = plot_dict['val'][:last_epoch]

    # Training
    print('Start warm up')
    model.freeze_except_last()
    for epoch in range(warm_up_epoch):
        warm_up(
            model=model,
            dataloader=warm_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
        )
    model.unfreeze()
    with open(log_train_path, write_mode) as f:
        for epoch in range(1, args.epoch + 1):
            print('Epoch:', epoch + last_epoch)
            f.write('Epoch: %d\n' % (epoch + last_epoch))
            loss = train_epoch(model=model,
                               dataloader=train_loader,
                               optimizer=optimizer,
                               criterion=criterion,
                               device=device,
                               exp=exp)
            RMSE = val_epoch(model=model,
                             dataloader=val_loader,
                             device=device,
                             exp=exp,
                             anchors=anchors)
            if not exp:
                f.write('Training loss: %.4f\n' % (loss))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('RMSE loss: %.4f' % (loss))
                print('RMSE val: %.4f' % (RMSE))
            else:
                loss, classify, regress = loss
                RMSE, accuracy = RMSE
                f.write('Training loss: %.4f\n' % (loss))
                f.write('Classification loss: %.4f\n' % (classify))
                f.write('Regression loss: %.4f\n' % (regress))
                f.write('Accuracy val: %.4f\n' % (accuracy))
                f.write('RMSE val: %.4f\n' % (RMSE))
                print('Training loss: %.4f' % (loss))
                print('Classification loss: %.4f' % (classify))
                print('Regression loss: %.4f' % (regress))
                print('Accuracy val: %.4f' % (accuracy))
                print('RMSE val: %.4f' % (RMSE))

            # torch.save(model.state_dict(), save_path + 'epoch%d.pth'%(epoch+last_epoch))
            save_name = save_path + 'epoch%d.pth' % (epoch + last_epoch)
            save_pth(save_name, epoch + last_epoch, model, optimizer, args.opt)

            plot_dict['train'].append(loss)
            plot_dict['val'].append(RMSE)
            with open(plot_train_path, 'w') as j:
                json.dump(plot_dict, j)
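The checkpoint helper save_pth used above is not shown. A hedged sketch of a version consistent with the resume logic earlier in this example (the keys match what the loading code reads; the real helper may differ):

import torch

def save_pth(path, epoch, model, optimizer, opt_name):
    # Keys mirror what the resume branch expects: 'model_state_dict',
    # 'optimizer_state_dict' and 'pre_opt'.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'pre_opt': opt_name,
    }, path)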
Example #12
def train(args, cfg):
    device = torch.device('cuda')
    model = ModelWithLoss(cfg).to(device)
    print('------------Model Architecture-------------')
    print(model)

    print('Loading Datasets...')
    data_loader = {}

    if cfg.SOLVER.AUGMENTATION:
        train_transforms = SyntheticTransforms()
    else:
        train_transforms = ToTensor()
        
    if cfg.DATASET.TRACK == 'synthetic':
        train_dataset = SyntheticBurst(ZurichRAW2RGB(cfg.DATASET.TRAIN_SYNTHETIC), crop_sz=cfg.SOLVER.PATCH_SIZE, burst_size=cfg.MODEL.BURST_SIZE, transform=train_transforms)
    elif cfg.DATASET.TRACK == 'real':
        train_dataset = BurstSRDataset(cfg.DATASET.REAL, split='train', crop_sz=cfg.SOLVER.PATCH_SIZE // 8, burst_size=cfg.MODEL.BURST_SIZE)
    sampler = RandomSampler(train_dataset)
    batch_sampler = BatchSampler(sampler=sampler, batch_size=cfg.SOLVER.BATCH_SIZE, drop_last=True)
    batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations=cfg.SOLVER.MAX_ITER)
    train_loader = DataLoader(train_dataset, num_workers=args.num_workers, batch_sampler=batch_sampler, pin_memory=True)

    data_loader['train'] = train_loader

    # if args.eval_step != 0:
    #     val_transforms =
    #     val_dataset =
    #     sampler = SequentialSampler(val_dataset)
    #     batch_sampler = BatchSampler(sampler=sampler, batch_size=args.batch_size, drop_last=False)
    #     val_loader = DataLoader(val_dataset, num_workers=args.num_workers, batch_sampler=batch_sampler)

    #     data_loader['val'] = val_loader

    if cfg.SOLVER.OPTIMIZER == 'radam':
        optimizer = RAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.SOLVER.LR)
    elif cfg.SOLVER.OPTIMIZER == 'adabound':
        optimizer = AdaBound(filter(lambda p:p.requires_grad, model.parameters()), lr=cfg.SOLVER.LR, final_lr=cfg.SOLVER.FINAL_LR)
    # optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.SOLVER.LR)
    # scheduler = MultiStepLR(optimizer, cfg.SOLVER.LR_STEP, gamma=0.1)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.LR, cfg.SOLVER.LR_STEP, warmup_factor=cfg.SOLVER.WARMUP_FACTOR, warmup_iters=cfg.SOLVER.WARMUP_ITER)

    if args.resume_iter != 0:
        model_path = os.path.join(cfg.OUTPUT_DIR, 'model', 'iteration_{}.pth'.format(args.resume_iter))
        print(f'Resume from {model_path}')
        model.model.load_state_dict(fix_model_state_dict(torch.load(os.path.join(cfg.OUTPUT_DIR, 'model', 'iteration_{}.pth'.format(args.resume_iter)))))
        if model.flow_refine:
            FR_model_path = os.path.dirname(model_path)[:-5] + "FR_model/" + 'iteration_{}.pth'.format(args.resume_iter)
            model.FR_model.load_state_dict(torch.load(FR_model_path))
        if model.denoise_burst:
            denoise_model_path = os.path.dirname(model_path)[:-5] + "denoise_model/" + 'iteration_{}.pth'.format(args.resume_iter)
            model.denoise_model.load_state_dict(torch.load(denoise_model_path))
        optimizer.load_state_dict(torch.load(os.path.join(cfg.OUTPUT_DIR, 'optimizer', 'iteration_{}.pth'.format(args.resume_iter))))
        scheduler.load_state_dict(torch.load(os.path.join(cfg.OUTPUT_DIR, 'scheduler', 'iteration_{}.pth'.format(args.resume_iter))))
    elif cfg.SOLVER.PRETRAIN_MODEL != '':
        model_path = cfg.SOLVER.PRETRAIN_MODEL
        print(f'load pretrain model from {model_path}')
        model.model.load_state_dict(fix_model_state_dict(torch.load(model_path)))
        if model.flow_refine:
            FR_model_path = os.path.dirname(model_path)[:-5] + "FR_model/" + os.path.basename(cfg.SOLVER.PRETRAIN_MODEL)
            model.FR_model.load_state_dict(torch.load(FR_model_path))
        if model.denoise_burst:
            denoise_model_path = os.path.dirname(model_path)[:-5] + "denoise_model/" + os.path.basename(cfg.SOLVER.PRETRAIN_MODEL)
            model.denoise_model.load_state_dict(torch.load(denoise_model_path))

    if cfg.SOLVER.SYNC_BATCHNORM:
        model = convert_model(model).to(device)
    
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpus)))

    if not args.debug:
        summary_writer = SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        summary_writer = None

    do_train(args, cfg, model, optimizer, scheduler, data_loader, device, summary_writer)
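The IterationBasedBatchSampler used above is not defined in the snippet. A hedged sketch of the usual idea: keep re-iterating the wrapped BatchSampler until a fixed number of iterations has been produced (class and attribute names are assumptions):

from torch.utils.data.sampler import BatchSampler

class SimpleIterationBasedBatchSampler(BatchSampler):
    """Wrap a BatchSampler and yield batches until num_iterations is reached."""

    def __init__(self, batch_sampler, num_iterations, start_iter=0):
        self.batch_sampler = batch_sampler
        self.num_iterations = num_iterations
        self.start_iter = start_iter

    def __iter__(self):
        iteration = self.start_iter
        while iteration < self.num_iterations:
            for batch in self.batch_sampler:
                iteration += 1
                if iteration > self.num_iterations:
                    break
                yield batch

    def __len__(self):
        return self.num_iterations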
Example #13
def train_loop(folds, fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)

    train_folds = train_folds[train_folds['StudyInstanceUID'].isin(
        train_annotations['StudyInstanceUID'].unique())].reset_index(drop=True)

    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds,
                                 train_annotations,
                                 use_annot=True,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 train_annotations,
                                 use_annot=False,
                                 transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=False)

    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=CFG.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=True,
                                                   num_workers=CFG.num_workers)

        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=CFG.batch_size *
                                                   2,
                                                   sampler=valid_sampler,
                                                   drop_last=False,
                                                   num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=CFG.factor,
                                          patience=CFG.patience,
                                          verbose=True,
                                          eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CFG.T_max,
                                          eta_min=CFG.min_lr,
                                          last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                    T_0=CFG.T_0,
                                                    T_mult=1,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    teacher_model = CustomSeResNet152D(CFG.model_name, pretrained=False)
    teacher_model.to(device)
    state = torch.load(CFG.teacher)
    teacher_model.load_state_dict(state['model'])
    for param in teacher_model.parameters():
        param.requires_grad = False
    teacher_model.eval()
    #     teacher_model.to(device)

    model = CustomSeResNet152D_WLF(CFG.model_name, pretrained=True)
    model.to(device)
    #     state = torch.load(CFG.student)
    #     model.load_state_dict(state['model'])

    optimizer = RAdam(model.parameters(),
                      lr=CFG.lr,
                      weight_decay=CFG.weight_decay)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    train_criterion = CustomLoss(weights=CFG.weights)
    valid_criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, teacher_model, model,
                                    train_criterion, optimizer, epoch,
                                    scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(
                    para_train_loader.per_device_loader(device), teacher_model,
                    model, train_criterion, optimizer, epoch, scheduler,
                    device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, teacher_model, model,
                                train_criterion, optimizer, epoch, scheduler,
                                device)

        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                                  valid_criterion, device)
            elif CFG.nprocs == 8:
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(
                    para_valid_loader.per_device_loader(device), model,
                    valid_criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(
                    torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                              valid_criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
            )
            LOGGER.info(
                f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
            )
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
                )
                LOGGER.info(
                    f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
                )
            elif CFG.nprocs == 8:
                xm.master_print(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
                )
                xm.master_print(
                    f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
                )

        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                )
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                    )
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                    )
                xm.save({
                    'model': model,
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model'
                    )
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model'
                    )
                xm.save({
                    'model': model,
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

#         # Save everything for later inference
#         if CFG.device == 'TPU':
#             xm.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')
#         elif CFG.device == 'GPU':
#             torch.save({'model': model.state_dict()}, OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_epoch{epoch+1}.pth')

        if CFG.nprocs != 8:
            check_point = torch.load(
                OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            for c in [f'pred_{c}' for c in CFG.target_cols]:
                valid_folds[c] = np.nan
            valid_folds[[f'pred_{c}'
                         for c in CFG.target_cols]] = check_point['preds']

    return valid_folds
Example #14
def get_optimizer(
    model: nn.Module,
    optimizer_name: str,
    learning_rate: float,
    weight_decay: float = 1e-5,
    no_weight_decay_on_bias: bool = False,
    eps: float = 1e-5,
    **kwargs,
) -> Optimizer:
    """
    Construct an Optimizer for given model
    Args:
        model: Model to optimize. Only parameters that require_grad will be used
        optimizer_name: Name of the optimizer. Case-insensitive
        learning_rate: Target learning rate (regardless of the scheduler)
        weight_decay: Target weight decay
        no_weight_decay_on_bias: Whether to disable weight decay on bias parameters
        eps: Default epsilon for Adam-like optimizers.
        **kwargs: Additional parameters for optimizer

    Returns:
        Constructed Optimizer instance
    """
    from torch.optim import ASGD, SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    # Optimizer parameter groups
    default_pg, biases_pg = [], []

    for k, v in model.named_parameters():
        if v.requires_grad:
            if str.endswith(k, ".bias"):
                biases_pg.append(v)  # biases
            else:
                default_pg.append(v)  # all else

    if no_weight_decay_on_bias:
        parameters = default_pg
    else:
        parameters = default_pg + biases_pg

    optimizer: Optimizer = None

    if optimizer_name.lower() == "sgd":
        optimizer = SGD(
            parameters,
            lr=learning_rate,
            momentum=0.9,
            nesterov=True,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "asgd":
        optimizer = ASGD(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "adam":
        optimizer = Adam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "rms":
        optimizer = RMSprop(parameters,
                            learning_rate,
                            weight_decay=weight_decay,
                            **kwargs)
    elif optimizer_name.lower() == "adamw":
        optimizer = AdamW(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "radam":
        optimizer = RAdam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "ranger":
        optimizer = Ranger(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "lamb":
        optimizer = Lamb(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "diffgrad":
        optimizer = DiffGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "novograd":
        optimizer = NovoGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        optimizer = FusedLAMB(parameters,
                              learning_rate,
                              eps=eps,
                              weight_decay=weight_decay,
                              **kwargs)
    elif optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD

        optimizer = FusedSGD(parameters,
                             learning_rate,
                             momentum=0.9,
                             nesterov=True,
                             weight_decay=weight_decay,
                             **kwargs)
    elif optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        optimizer = FusedAdam(parameters,
                              learning_rate,
                              eps=eps,
                              weight_decay=weight_decay,
                              adam_w_mode=True,
                              **kwargs)
    else:
        raise KeyError(f"Cannot get optimizer by name {optimizer_name}")

    # Currently either no_wd or per-group lr
    if no_weight_decay_on_bias:
        optimizer.add_param_group({"params": biases_pg, "weight_decay": 0})

    return optimizer
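A short usage sketch for get_optimizer above; the model is a placeholder and the hyperparameter values are illustrative only:

import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
optimizer = get_optimizer(
    model,
    optimizer_name="radam",
    learning_rate=3e-4,
    weight_decay=1e-4,
    no_weight_decay_on_bias=True,   # biases go into a second group with weight_decay=0
)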
Example #15
def main(cfg: DictConfig):
    # Config  ################################################################
    IMAGE_NUM = cfg.data.image_num
    IMAGE_SIZE = cfg.data.image_size
    exp_name = cfg.data.exp
    model_name = f'efficientnet-{cfg.data.model_name}'
    BATCH_SIZE = cfg.training.batch_size
    lr = cfg.training.lr
    NUM_EPOCHS = cfg.training.num_epoch
    FOLD = cfg.training.fold
    OPTIMIZER = cfg.training.optimizer
    SCHEDULER = cfg.training.scheduler

    # Change Current Dir  ################################################################
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Data Loading  ################################################################
    # Background_rate = 0.7
    # img_path = glob.glob('./data/grid_256_level_1/img/*.jpg')
    img_path = glob.glob('./data/grid_128_level_1/img/*.jpg')

    # Load the label data
    # meta = pd.read_csv('./data/input/train.csv')
    # meta = pd.read_csv('./data/input/modified_train.csv')   # corrected ver. 1
    meta = pd.read_csv(
        './data/input/modified_train_v2.csv')  # corrected ver. 2 (accounts for the ratio of scores 3, 4 and 5)

    # Data Augmentation
    transform = ImageTransform(img_size=IMAGE_SIZE)  # specify the image size

    # StratifiedKFold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    meta['fold'] = -1
    for i, (trn_idx, val_idx) in enumerate(cv.split(meta, meta['isup_grade'])):
        meta.loc[val_idx, 'fold'] = i

    # Dataset, DataLoader  ################################################################
    # If multi is True, use all folds; if False (default), use only a single fold
    dataloaders = get_dataloaders(meta,
                                  FOLD,
                                  img_path,
                                  transform,
                                  IMAGE_NUM,
                                  BATCH_SIZE,
                                  multi=cfg.training.multi_fold,
                                  binning=cfg.training.binning)

    # Model  ################################################################
    if cfg.training.binning:
        OUTPUTSIZE = 5
    else:
        OUTPUTSIZE = 6
    net = ModelEFN_2(model_name=model_name, output_size=OUTPUTSIZE)

    # Set Weight
    # model_path = './weights/efn_b0_fromjpg_augtile_04_epoch_18_loss_1.191_kappa_0.716.pth'
    # net.load_state_dict(torch.load(model_path, map_location=device))

    # criterion = nn.CrossEntropyLoss(reduction='mean')
    criterion = nn.BCEWithLogitsLoss()
    # criterion = QWKLoss()

    opt_dict = {
        'adam':
        optim.Adam(net.parameters(), lr=lr),
        'radam':
        RAdam(net.parameters(), lr=lr),
        'sgd':
        optim.SGD(net.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9)
    }
    optimizer = opt_dict[OPTIMIZER]

    sch_dict = {
        'step':
        StepLR(optimizer, step_size=4, gamma=0.5),
        'cos':
        CosineAnnealingWarmRestarts(optimizer,
                                    T_0=5,
                                    T_mult=2,
                                    eta_min=lr * 0.1),
        'cos_2':
        CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=0),
        'none':
        None,
        'warmup':
        None
    }
    scheduler = sch_dict[SCHEDULER]

    if SCHEDULER == 'warmup':
        del optimizer, scheduler
        warmup_factor = 10
        warmup_epo = 1
        if OPTIMIZER == 'adam':
            optimizer = optim.Adam(net.parameters(), lr=lr / warmup_factor)
        elif OPTIMIZER == 'radam':
            optimizer = RAdam(net.parameters(), lr=lr / warmup_factor)
        else:
            optimizer = optim.SGD(net.parameters(),
                                  lr=lr,
                                  weight_decay=0.0001,
                                  momentum=0.9)

        scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, NUM_EPOCHS - warmup_epo, eta_min=0)
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=warmup_factor,
                                           total_epoch=warmup_epo,
                                           after_scheduler=scheduler_cosine)

    # ML Flow  ###########################################################################
    experiment_name = f'PANDA_{cfg.data.model_name}'
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        # Log the parameters
        for k, v in cfg.data.items():
            mlflow.log_param('data/' + str(k), v)
        for k, v in cfg.training.items():
            mlflow.log_param('training/' + str(k), v)

        # Train  ################################################################
        writer = SummaryWriter(f'./tensorboard/{exp_name}')
        if cfg.training.multi_fold:
            trainer = Trainer_multifold(dataloaders,
                                        net,
                                        device,
                                        NUM_EPOCHS,
                                        criterion,
                                        optimizer,
                                        scheduler,
                                        exp=exp_name,
                                        writer=writer,
                                        save_weight_path='./weights',
                                        binning=cfg.training.binning)
        else:
            trainer = Trainer(dataloaders,
                              net,
                              device,
                              NUM_EPOCHS,
                              criterion,
                              optimizer,
                              scheduler,
                              exp=exp_name,
                              writer=writer,
                              save_weight_path='./weights',
                              binning=cfg.training.binning)
        trainer.train()
Example #16
def get_optimizer(optimizer_name: str,
                  parameters,
                  learning_rate: float,
                  weight_decay=1e-5,
                  eps=1e-5,
                  **kwargs) -> Optimizer:
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    if optimizer_name.lower() == "sgd":
        return SGD(parameters,
                   learning_rate,
                   momentum=0.9,
                   nesterov=True,
                   weight_decay=weight_decay,
                   **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters,
                    learning_rate,
                    weight_decay=weight_decay,
                    eps=eps,
                    **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters,
                       learning_rate,
                       weight_decay=weight_decay,
                       **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters,
                     learning_rate,
                     weight_decay=weight_decay,
                     eps=eps,
                     **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters,
                     learning_rate,
                     weight_decay=weight_decay,
                     eps=eps,
                     **kwargs)  # As Jeremy suggests

    # Optimizers from torch-optimizer
    if optimizer_name.lower() == "ranger":
        return Ranger(parameters,
                      learning_rate,
                      eps=eps,
                      weight_decay=weight_decay,
                      **kwargs)

    if optimizer_name.lower() == "lamb":
        return Lamb(parameters,
                    learning_rate,
                    eps=eps,
                    weight_decay=weight_decay,
                    **kwargs)

    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters,
                        learning_rate,
                        eps=eps,
                        weight_decay=weight_decay,
                        **kwargs)

    if optimizer_name.lower() == "novograd":
        return NovoGrad(parameters,
                        learning_rate,
                        eps=eps,
                        weight_decay=weight_decay,
                        **kwargs)

    # Optimizers from Apex (Fused version is faster on GPU with tensor cores)
    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters,
                         learning_rate,
                         eps=eps,
                         weight_decay=weight_decay,
                         **kwargs)

    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD

        return FusedSGD(parameters,
                        learning_rate,
                        momentum=0.9,
                        nesterov=True,
                        weight_decay=weight_decay,
                        **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters,
                         learning_rate,
                         eps=eps,
                         weight_decay=weight_decay,
                         adam_w_mode=True,
                         **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
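A minimal usage sketch for the dispatcher above (the model, data, and learning rate are illustrative placeholders, not part of the original snippet):

import torch
import torch.nn as nn

# Hypothetical tiny model; any nn.Module's parameters work with get_optimizer.
model = nn.Linear(128, 10)
# "radam" dispatches to torch_optimizer.RAdam via the branch above.
optimizer = get_optimizer("radam", model.parameters(), learning_rate=3e-4)

x, y = torch.randn(4, 128), torch.randint(0, 10, (4,))
loss = nn.CrossEntropyLoss()(model(x), y)

optimizer.zero_grad()
loss.backward()
optimizer.step()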
Example #17
def main(
    data_dir,
    save_dir,
    total_steps,
    warmup_steps,
    valid_steps,
    log_steps,
    save_steps,
    milestones,
    exclusive_rate,
    n_samples,
    accu_steps,
    batch_size,
    n_workers,
    preload,
    comment,
    ckpt,
    grad_norm_clip,
    use_target_features,
    **kwargs,
):
    """Main function."""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    metadata_path = Path(data_dir) / "metadata.json"

    dataset = IntraSpeakerDataset(data_dir,
                                  metadata_path,
                                  n_samples,
                                  preload,
                                  ref_feat=use_target_features)
    trainlen = int(0.9 * len(dataset))
    lengths = [trainlen, len(dataset) - trainlen]
    trainset, validset = random_split(dataset, lengths)
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    valid_loader = DataLoader(
        validset,
        batch_size=batch_size * accu_steps,
        num_workers=n_workers,
        drop_last=True,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    train_iterator = iter(train_loader)

    if comment is not None:
        log_dir = "logs/"
        log_dir += datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        log_dir += "_" + comment
        writer = SummaryWriter(log_dir)

    save_dir_path = Path(save_dir)
    save_dir_path.mkdir(parents=True, exist_ok=True)

    if ckpt is not None:
        try:
            start_step = int(ckpt.split('-')[1][4:])
            ref_included = True
        except (IndexError, ValueError):  # checkpoint name does not encode a step
            start_step = 0
            ref_included = False

        model = torch.jit.load(ckpt).to(device)
        optimizer = RAdam(
            [
                {
                    "params": model.unet.parameters(),
                    "lr": 1e-6
                },
                {
                    "params": model.smoothers.parameters()
                },
                {
                    "params": model.mel_linear.parameters()
                },
                {
                    "params": model.post_net.parameters()
                },
            ],
            lr=1e-4,
        )
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps,
                                                    total_steps - start_step)
        print("Optimizer and scheduler restarted.")
        print(f"Model loaded from {ckpt}, iteration: {start_step}")
    else:
        ref_included = False
        start_step = 0

        model = FragmentVC().to(device)
        model = torch.jit.script(model)
        optimizer = RAdam(model.parameters(), lr=1e-4)
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps,
                                                    total_steps)

    criterion = nn.L1Loss()

    best_loss = float("inf")
    best_state_dict = None

    self_exclude = 0.0

    pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

    for step in range(start_step, total_steps):
        batch_loss = 0.0

        for _ in range(accu_steps):
            try:
                batch = next(train_iterator)
            except StopIteration:
                train_iterator = iter(train_loader)
                batch = next(train_iterator)

            loss = model_fn(batch, model, criterion, self_exclude,
                            ref_included, device)
            loss = loss / accu_steps
            batch_loss += loss.item()
            loss.backward()

        # Clip gradients before the optimizer update so the clipping takes effect.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        pbar.update()
        pbar.set_postfix(loss=f"{batch_loss:.2f}",
                         excl=self_exclude,
                         step=step + 1)

        if step % log_steps == 0 and comment is not None:
            writer.add_scalar("Loss/train", batch_loss, step)
            writer.add_scalar("Self-exclusive Rate", self_exclude, step)

        if (step + 1) % valid_steps == 0:
            pbar.close()

            valid_loss = valid(valid_loader, model, criterion, device)

            if comment is not None:
                writer.add_scalar("Loss/valid", valid_loss, step + 1)

            if valid_loss < best_loss:
                best_loss = valid_loss
                best_state_dict = model.state_dict()

            pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

        if (step + 1) % save_steps == 0 and best_state_dict is not None:
            loss_str = f"{best_loss:.4f}".replace(".", "dot")
            best_ckpt_name = f"retriever-best-loss{loss_str}.pt"

            loss_str = f"{valid_loss:.4f}".replace(".", "dot")
            curr_ckpt_name = f"retriever-step{step+1}-loss{loss_str}.pt"

            current_state_dict = model.state_dict()
            model.cpu()

            model.load_state_dict(best_state_dict)
            model.save(str(save_dir_path / best_ckpt_name))

            model.load_state_dict(current_state_dict)
            model.save(str(save_dir_path / curr_ckpt_name))

            model.to(device)
            pbar.write(
                f"Step {step + 1}, best model saved. (loss={best_loss:.4f})")

        if (step + 1) >= milestones[1]:
            self_exclude = exclusive_rate

        elif (step + 1) == milestones[0]:
            ref_included = True
            optimizer = RAdam(
                [
                    {
                        "params": model.unet.parameters(),
                        "lr": 1e-6
                    },
                    {
                        "params": model.smoothers.parameters()
                    },
                    {
                        "params": model.mel_linear.parameters()
                    },
                    {
                        "params": model.post_net.parameters()
                    },
                ],
                lr=1e-4,
            )
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, warmup_steps, total_steps - milestones[0])
            pbar.write("Optimizer and scheduler restarted.")

        elif (step + 1) > milestones[0]:
            self_exclude = (step + 1 - milestones[0]) / (milestones[1] -
                                                         milestones[0])
            self_exclude *= exclusive_rate

    pbar.close()
Example #18
def get_optimizer(net, opt_conf, tasks=None, is_disc=False, iterations=-1):
    """Returns a tuple (optimizer, scheduler) according to opt_conf which
    should come from the trainer's opts as: trainer.opts.<model>.opt

    Args:
        net (nn.Module): Network to update
        opt_conf (addict.Dict): optimizer and scheduler options
        tasks: list of tasks
        iterations (int, optional): Last epoch number. Defaults to -1, meaning
            start with base lr.

    Returns:
        Tuple: (torch.Optimizer, torch._LRScheduler)
    """
    opt = scheduler = None
    lr_names = []
    if tasks is None:
        lr_default = opt_conf.lr
        params = net.parameters()
        lr_names.append("full")
    elif isinstance(opt_conf.lr, float):  # Use default for all tasks
        lr_default = opt_conf.lr
        params = net.parameters()
        lr_names.append("full")
    elif len(opt_conf.lr) == 1:  # Use default for all tasks
        lr_default = opt_conf.lr.default
        params = net.parameters()
        lr_names.append("full")
    else:
        lr_default = opt_conf.lr.default
        params = list()
        for task in tasks:
            lr = opt_conf.lr.get(task, lr_default)
            parameters = None
            # Parameters for encoder
            if not is_disc:
                if task == "m":
                    parameters = net.encoder.parameters()
                    params.append({"params": parameters, "lr": lr})
                    lr_names.append("encoder")
                # Parameters for decoders
                if task == "p":
                    if hasattr(net, "painter"):
                        parameters = net.painter.parameters()
                        lr_names.append("painter")
                else:
                    parameters = net.decoders[task].parameters()
                    lr_names.append(f"decoder_{task}")
            else:
                if task in net:
                    parameters = net[task].parameters()
                    lr_names.append(f"disc_{task}")

            if parameters is not None:
                params.append({"params": parameters, "lr": lr})

    if opt_conf.optimizer.lower() == "extraadam":
        opt = ExtraAdam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    elif opt_conf.optimizer.lower() == "novograd":
        opt = NovoGrad(params, lr=lr_default,
                       betas=(opt_conf.beta1, 0))  # default for beta2 is 0
    elif opt_conf.optimizer.lower() == "radam":
        opt = RAdam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    elif opt_conf.optimizer.lower() == "rmsprop":
        opt = RMSprop(params, lr=lr_default)
    else:
        opt = Adam(params, lr=lr_default, betas=(opt_conf.beta1, 0.999))
    scheduler = get_scheduler(opt, opt_conf, iterations)
    return opt, scheduler, lr_names
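A hedged call sketch for the variant above, using its simplest path (a single float learning rate, no per-task groups). The network is a placeholder, and get_scheduler from the same codebase may read additional opt_conf fields (e.g. a scheduler name) that are not shown here:

import torch.nn as nn
from addict import Dict

opt_conf = Dict()
opt_conf.optimizer = "radam"   # dispatches to the RAdam branch above
opt_conf.beta1 = 0.9
opt_conf.lr = 1e-4             # a plain float applies one lr to all parameters

net = nn.Linear(32, 2)         # placeholder network for illustration
opt, scheduler, lr_names = get_optimizer(net, opt_conf)
# lr_names == ["full"]: one parameter group covering the whole network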
Example #19
def train(rank: int, cfg: DictConfig):
    print(OmegaConf.to_yaml(cfg))

    if cfg.train.n_gpu > 1:
        init_process_group(backend=cfg.train.dist_config['dist_backend'],
                           init_method=cfg.train.dist_config['dist_url'],
                           world_size=cfg.train.dist_config['world_size'] *
                           cfg.train.n_gpu,
                           rank=rank)

    device = torch.device(
        'cuda:{:d}'.format(rank) if torch.cuda.is_available() else 'cpu')

    generator = Generator(sum(cfg.model.feature_dims), *cfg.model.cond_dims,
                          **cfg.model.generator).to(device)
    discriminator = Discriminator(**cfg.model.discriminator).to(device)

    if rank == 0:
        print(generator)
        os.makedirs(cfg.train.ckpt_dir, exist_ok=True)
        print("checkpoints directory : ", cfg.train.ckpt_dir)

    cp_g = cp_do = None
    if os.path.isdir(cfg.train.ckpt_dir):
        cp_g = scan_checkpoint(cfg.train.ckpt_dir, 'g_')
        cp_do = scan_checkpoint(cfg.train.ckpt_dir, 'd_')

    steps = 1
    if cp_g is None or cp_do is None:
        state_dict_do = None
        last_epoch = -1
    else:
        state_dict_g = load_checkpoint(cp_g, device)
        state_dict_do = load_checkpoint(cp_do, device)
        generator.load_state_dict(state_dict_g['generator'])
        discriminator.load_state_dict(state_dict_do['discriminator'])
        steps = state_dict_do['steps'] + 1
        last_epoch = state_dict_do['epoch']

    if cfg.train.n_gpu > 1:
        generator = DistributedDataParallel(generator,
                                            device_ids=[rank]).to(device)
        discriminator = DistributedDataParallel(discriminator,
                                                device_ids=[rank]).to(device)

    optim_g = RAdam(generator.parameters(), cfg.opt.lr, betas=cfg.opt.betas)
    optim_d = RAdam(discriminator.parameters(),
                    cfg.opt.lr,
                    betas=cfg.opt.betas)

    if state_dict_do is not None:
        optim_g.load_state_dict(state_dict_do['optim_g'])
        optim_d.load_state_dict(state_dict_do['optim_d'])

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
        optim_g, gamma=cfg.opt.lr_decay, last_epoch=last_epoch)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
        optim_d, gamma=cfg.opt.lr_decay, last_epoch=last_epoch)

    train_filelist = load_dataset_filelist(cfg.dataset.train_list)
    trainset = FeatureDataset(cfg.dataset, train_filelist, cfg.data)
    train_sampler = DistributedSampler(
        trainset) if cfg.train.n_gpu > 1 else None
    train_loader = DataLoader(trainset,
                              batch_size=cfg.train.batch_size,
                              num_workers=cfg.train.num_workers,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              pin_memory=True,
                              drop_last=True)

    if rank == 0:
        val_filelist = load_dataset_filelist(cfg.dataset.test_list)
        valset = FeatureDataset(cfg.dataset,
                                val_filelist,
                                cfg.data,
                                segmented=False)
        val_loader = DataLoader(valset,
                                batch_size=1,
                                num_workers=cfg.train.num_workers,
                                shuffle=False,
                                sampler=None,
                                pin_memory=True)

        sw = SummaryWriter(os.path.join(cfg.train.ckpt_dir, 'logs'))

    generator.train()
    discriminator.train()
    for epoch in range(max(0, last_epoch), cfg.train.epochs):
        if rank == 0:
            start = time.time()
            print("Epoch: {}".format(epoch + 1))

        if cfg.train.n_gpu > 1:
            train_sampler.set_epoch(epoch)

        for y, x_noised_features, x_noised_cond in train_loader:
            if rank == 0:
                start_b = time.time()

            y = y.to(device, non_blocking=True)
            x_noised_features = x_noised_features.transpose(1, 2).to(
                device, non_blocking=True)
            x_noised_cond = x_noised_cond.to(device, non_blocking=True)
            z1 = torch.randn(cfg.train.batch_size,
                             cfg.model.cond_dims[1],
                             device=device)
            z2 = torch.randn(cfg.train.batch_size,
                             cfg.model.cond_dims[1],
                             device=device)

            y_hat1 = generator(x_noised_features, x_noised_cond, z=z1)
            y_hat2 = generator(x_noised_features, x_noised_cond, z=z2)

            # Discriminator (scores the detached generator output)
            real_scores, fake_scores = discriminator(y), discriminator(
                y_hat1.detach())
            d_loss = discriminator_loss(real_scores, fake_scores)

            optim_d.zero_grad()
            d_loss.backward()
            optim_d.step()

            # Generator: re-score the non-detached output so adversarial
            # gradients flow back into the generator
            fake_scores = discriminator(y_hat1)
            g_stft_loss = criterion(y, y_hat1) + criterion(
                y, y_hat2) - criterion(y_hat1, y_hat2)
            g_adv_loss = adversarial_loss(fake_scores)
            g_loss = g_adv_loss + g_stft_loss

            optim_g.zero_grad()
            g_loss.backward()
            optim_g.step()

            if rank == 0:
                # STDOUT logging
                if steps % cfg.train.stdout_interval == 0:
                    with torch.no_grad():
                        print(
                            'Steps : {:d}, Gen Loss Total : {:4.3f}, STFT Error : {:4.3f}, s/b : {:4.3f}'
                            .format(steps, g_loss, g_stft_loss,
                                    time.time() - start_b))

                # checkpointing
                if steps % cfg.train.checkpoint_interval == 0:
                    ckpt_dir = "{}/g_{:08d}".format(cfg.train.ckpt_dir, steps)
                    save_checkpoint(
                        ckpt_dir, {
                            'generator':
                            (generator.module if cfg.train.n_gpu > 1 else
                             generator).state_dict()
                        })
                    ckpt_dir = "{}/do_{:08d}".format(cfg.train.ckpt_dir, steps)
                    save_checkpoint(
                        ckpt_dir, {
                            'discriminator':
                            (discriminator.module if cfg.train.n_gpu > 1 else
                             discriminator).state_dict(),
                            'optim_g':
                            optim_g.state_dict(),
                            'optim_d':
                            optim_d.state_dict(),
                            'steps':
                            steps,
                            'epoch':
                            epoch
                        })

                # Tensorboard summary logging
                if steps % cfg.train.summary_interval == 0:
                    sw.add_scalar("training/gen_loss_total", g_loss, steps)
                    sw.add_scalar("training/gen_stft_error", g_stft_loss,
                                  steps)

                # Validation
                if steps % cfg.train.validation_interval == 0:
                    generator.eval()
                    torch.cuda.empty_cache()
                    val_err_tot = 0
                    with torch.no_grad():
                        for j, (y, x_noised_features,
                                x_noised_cond) in enumerate(val_loader):
                            y_hat = generator(
                                x_noised_features.transpose(1, 2).to(device),
                                x_noised_cond.to(device))
                            val_err_tot += criterion(y.to(device), y_hat).item()

                            if j <= 4:
                                # sw.add_audio('noised/y_noised_{}'.format(j), y_noised[0], steps, cfg.data.target_sample_rate)
                                sw.add_audio('generated/y_hat_{}'.format(j),
                                             y_hat[0], steps,
                                             cfg.data.sample_rate)
                                sw.add_audio('gt/y_{}'.format(j), y[0], steps,
                                             cfg.data.sample_rate)

                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/stft_error", val_err, steps)

                    generator.train()

            steps += 1

        scheduler_g.step()
        scheduler_d.step()

        if rank == 0:
            print('Time taken for epoch {} is {} sec\n'.format(
                epoch + 1, int(time.time() - start)))