Example #1
0
def get_model():
    # Fetch the model class defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model parameters to GPU memory.
    model.cuda()

    # Track the model with wandb.
    wandb.watch(model)

    # Print the number of trainable parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If there are two or more GPUs, wrap the model in DataParallel.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Fetch the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Fetch the optimizers defined in optimizer.py.
    # DataParallel does not forward attribute access, so reach submodules
    # through .module when the model is wrapped.
    base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    optimizer_encoder = create_optimizer(
        CFG.optimizer, params=base_model.seg_model.encoder.parameters(), lr=1e-8)

    optimizer_decoder = create_optimizer(
        CFG.optimizer,
        params=[{
            "params": base_model.seg_model.decoder.parameters()
        }, {
            "params": base_model.seg_model.segmentation_head.parameters()
        }],
        lr=1e-8)

    # Fetch the schedulers defined in scheduler.py.
    scheduler_encoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_encoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate * 0.1,
                                         T_up=5,
                                         gamma=0.3)

    scheduler_decoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_decoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate,
                                         T_up=5,
                                         gamma=0.3)

    return model, criterion, optimizer_encoder, optimizer_decoder, scheduler_encoder, scheduler_decoder
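The example above pulls create_criterion, create_optimizer, and create_scheduler from loss.py, optimizer.py, and scheduler.py, which are not included in this collection. A minimal sketch of what such name-to-class factories might look like, assuming plain registries over standard PyTorch classes (project-specific losses and custom warmup-restart schedulers would be registered the same way):

# Hypothetical sketch of the factory helpers assumed by the examples; the real
# loss.py / optimizer.py / scheduler.py may register different classes.
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

_CRITERIA = {"cross_entropy": nn.CrossEntropyLoss}
_OPTIMIZERS = {"SGD": optim.SGD, "Adam": optim.Adam, "AdamW": optim.AdamW}
_SCHEDULERS = {"StepLR": lr_scheduler.StepLR,
               "CosineAnnealingLR": lr_scheduler.CosineAnnealingLR}


def create_criterion(name, **kwargs):
    # Look up a loss by name and instantiate it with the given kwargs.
    return _CRITERIA[name](**kwargs)


def create_optimizer(name, params, **kwargs):
    # `params` may be a plain iterable of parameters or a list of param groups.
    return _OPTIMIZERS[name](params, **kwargs)


def create_scheduler(name, optimizer, **kwargs):
    # Bind the named LR scheduler to an already-constructed optimizer.
    return _SCHEDULERS[name](optimizer, **kwargs)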
Example #2
0
def main(args):
    wandb.init(project="stage-1", reinit=True)
    wandb.run.name = args.MODEL
    wandb.config.update(args)

    args = wandb.config

    train_loader, val_loader = get_loader(args.BATCH_SIZE)
    print("Get loader")
    model = get_res_pre_trained(args.MODEL).to(args.device)
    print("Load model")

    wandb.watch(model)

    criterion = create_criterion(args.LOSS)
    optimizer = optim.Adam(model.parameters(), lr=args.LEARNING_RATE)
    print("Run")
    run(args, model, criterion, optimizer, train_loader, val_loader)
Example #3
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"),
                             args.dataset)  # e.g. MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"),
                               args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=8,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=8,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # -- model
    model_module = getattr(import_module("model"),
                           args.model)  # default: BaseModel
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)

    # -- loss & metric
    criterion = create_criterion(args.criterion)  # default: cross_entropy
    opt_module = getattr(import_module("torch.optim"),
                         args.optimizer)  # default: SGD
    optimizer = opt_module(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=args.lr,
                           weight_decay=5e-4)
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    # -- logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc,
                                  epoch * len(train_loader) + idx)

                loss_value = 0
                matches = 0

        scheduler.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)

                if figure is None:
                    inputs_np = torch.clone(inputs).detach().cpu().permute(
                        0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np, labels, preds,
                        args.dataset != "MaskSplitByProfileDataset")

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            best_val_loss = min(best_val_loss, val_loss)
            if val_acc > best_val_acc:
                print(
                    f"New best model for val accuracy : {val_acc:4.2%}! saving the best model.."
                )
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc
            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
            )
            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_figure("results", figure, epoch)
            print()
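The training template above relies on seed_everything, increment_path, and get_lr, which live elsewhere in the project. A hedged sketch of typical implementations consistent with how they are called here; the project's own versions may differ:

# Hypothetical helper functions; assumed implementations based on how the
# training template calls them.
import glob
import os
import random
import re

import numpy as np
import torch


def seed_everything(seed):
    # Fix every RNG the training loop touches so runs are reproducible.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


def get_lr(optimizer):
    # Report the learning rate of the first parameter group (used for logging).
    for param_group in optimizer.param_groups:
        return param_group["lr"]


def increment_path(path):
    # Return `path` if it is unused, otherwise append the next run index:
    # runs/exp -> runs/exp2 -> runs/exp3 ...
    if not os.path.exists(path):
        return path
    dirs = glob.glob(f"{path}*")
    matches = [re.search(rf"{re.escape(path)}(\d+)", d) for d in dirs]
    indices = [int(m.group(1)) for m in matches if m]
    return f"{path}{max(indices, default=1) + 1}"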
Example #4
0
def get_model(train_iter):
    # get model from mask_model.py and define with parameters
    model_module = getattr(import_module("mask_model"), CFG.model)
    model = model_module()

    # Move the model parameters to GPU memory
    model.cuda()

    # Print the number of trainable parameters (weights) of the model
    print('parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If there are two or more GPUs, train with DataParallel
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Get the criteria from loss.py and instantiate them with their parameters
    criterion_mask = create_criterion(CFG.criterion, classes=3, smoothing=0.05)
    criterion_gender = create_criterion('cross_entropy')
    criterion_age = create_criterion(CFG.criterion, classes=3, smoothing=0.05)

    # Get the optimizers from optimizer.py and instantiate them with their parameters.
    # DataParallel does not forward attribute access, so reach submodules through
    # .module when the model is wrapped.
    base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    optimizer_backbone = create_optimizer(
        CFG.optimizer,
        params=base_model.backbone.parameters(),
        lr=CFG.learning_rate * 0.1,
        momentum=0.9,
        weight_decay=1e-2
    )
    optimizer_classifier = create_optimizer(
        CFG.optimizer,
        params=[
            {"params": base_model.mask_layer.parameters()},
            {"params": base_model.gender_layer.parameters()},
            {"params": base_model.age_layer.parameters()},
        ],
        lr=CFG.learning_rate,
        momentum=0.9,
        weight_decay=1e-2
    )

    # Get the schedulers from scheduler.py and instantiate them with their parameters
    scheduler_backbone = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_backbone,
        max_lr=CFG.learning_rate * 0.1,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5/CFG.nepochs,
        anneal_strategy='cos'
    )
    scheduler_classifier = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_classifier,
        max_lr=CFG.learning_rate,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5/CFG.nepochs,
        anneal_strategy='cos'
    )

    return model, criterion_mask, criterion_gender, criterion_age, optimizer_backbone, optimizer_classifier, scheduler_backbone, scheduler_classifier
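Because the backbone and the three classifier heads get separate optimizer/scheduler pairs, the caller has to step both per batch. A sketch of how the tuple returned by get_model(train_iter) might be consumed; the loop body is an assumption (the model is taken to return one logit tensor per head):

# Hypothetical one-epoch loop for the three-head model above; both
# optimizer/scheduler pairs are stepped every batch (OneCycle-style schedules).
def train_one_epoch(model, criteria, optimizers, schedulers, loader, device="cuda"):
    crit_mask, crit_gender, crit_age = criteria
    model.train()
    for images, y_mask, y_gender, y_age in loader:
        images = images.to(device)
        out_mask, out_gender, out_age = model(images)  # assumed three-head output
        loss = (crit_mask(out_mask, y_mask.to(device))
                + crit_gender(out_gender, y_gender.to(device))
                + crit_age(out_age, y_age.to(device)))
        for opt in optimizers:
            opt.zero_grad()
        loss.backward()
        for opt in optimizers:
            opt.step()
        for sch in schedulers:  # per-batch scheduler steps
            sch.step()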
Example #5
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"),
                             args.dataset)  # MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"),
                               args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=8,
                              shuffle=True,
                              pin_memory=use_cuda,
                              drop_last=True)

    val_loader = DataLoader(val_set,
                            batch_size=args.batch_size,
                            num_workers=8,
                            shuffle=False,
                            pin_memory=use_cuda,
                            drop_last=True)

    # -- model
    models = []
    model_module_gender = getattr(import_module("model"),
                                  args.model_gender)  # default: BaseModel
    model_gender = model_module_gender(num_classes=args.num_classes_gender,
                                       grad_point=args.grad_point).to(device)
    model_gender = torch.nn.DataParallel(model_gender)

    # -- loss & metric

    criterion_gender = create_criterion(
        args.criterion_gender, classes=args.num_classes_gender)  # default: f1
    if args.optimizer == "AdamP":
        optimizer_gender = AdamP(filter(lambda p: p.requires_grad,
                                        model_gender.parameters()),
                                 lr=args.lr,
                                 weight_decay=5e-4)
    else:
        opt_module = getattr(import_module('torch.optim'),
                             args.optimizer)  # default: Adam
        optimizer_gender = opt_module(filter(lambda p: p.requires_grad,
                                             model_gender.parameters()),
                                      lr=args.lr,
                                      weight_decay=5e-4)
    scheduler_gender = StepLR(optimizer_gender, args.lr_decay_step, gamma=0.5)

    # -- logging
    logger_gender = SummaryWriter(log_dir=os.path.join(save_dir, 'gender'))
    with open(Path(save_dir) / 'gender' / 'config.json', 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc_gender = 0
    best_val_loss_gender = np.inf
    for epoch in range(args.epochs):
        # train loop
        model_gender.train()
        loss_value_gender = 0
        matches_gender = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels_mask, labels_gender, labels_age = train_batch
            inputs = inputs.to(device)
            labels_gender = labels_gender.to(device)

            optimizer_gender.zero_grad()

            outs_gender = model_gender(inputs)
            preds_gender = torch.argmax(outs_gender, dim=-1)
            loss_gender = criterion_gender(outs_gender, labels_gender)

            loss_gender.backward()
            optimizer_gender.step()

            loss_value_gender += loss_gender.item()
            matches_gender += (preds_gender == labels_gender).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss_gender = loss_value_gender / args.log_interval
                train_acc_gender = matches_gender / args.batch_size / args.log_interval
                current_lr_gender = get_lr(optimizer_gender)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss_gender:4.4} || training accuracy {train_acc_gender:4.2%} || lr {current_lr_gender}"
                )
                logger_gender.add_scalar("Train/loss", train_loss_gender,
                                         epoch * len(train_loader) + idx)
                logger_gender.add_scalar("Train/accuracy", train_acc_gender,
                                         epoch * len(train_loader) + idx)

                loss_value_gender = 0
                matches_gender = 0

        scheduler_gender.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model_gender.eval()
            val_loss_items_gender = []
            val_acc_items_gender = []
            figure = None
            for val_batch in val_loader:
                inputs, labels_mask, labels_gender, labels_age = val_batch
                inputs = inputs.to(device)
                labels_gender = labels_gender.to(device)

                outs_gender = model_gender(inputs)
                preds_gender = torch.argmax(outs_gender, dim=-1)

                loss_item_gender = criterion_gender(outs_gender,
                                                    labels_gender).item()
                acc_item_gender = (labels_gender == preds_gender).sum().item()
                val_loss_items_gender.append(loss_item_gender)
                val_acc_items_gender.append(acc_item_gender)

                if figure is None:
                    # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                    inputs_np = torch.clone(inputs).detach().cpu()
                    inputs_np = inputs_np.permute(0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np, labels_mask, preds_gender,
                        args.dataset != "MaskSplitByProfileDataset")
                    plt.show()

            val_loss_gender = np.sum(val_loss_items_gender) / len(val_loader)
            val_acc_gender = np.sum(val_acc_items_gender) / len(val_set)
            if val_loss_gender < best_val_loss_gender or val_acc_gender > best_val_acc_gender:
                save_model(model_gender, epoch,
                           val_loss_gender, val_acc_gender,
                           os.path.join(save_dir, "gender"), args.model_gender)
                if val_loss_gender < best_val_loss_gender and val_acc_gender > best_val_acc_gender:
                    print(
                        f"New best model_gender for val acc and val loss : {val_acc_gender:4.2%} {val_loss_gender:4.2}! saving the best model_gender.."
                    )
                    best_val_loss_gender = val_loss_gender
                    best_val_acc_gender = val_acc_gender
                elif val_loss_gender < best_val_loss_gender:
                    print(
                        f"New best model_gender for val loss : {val_loss_gender:4.2}! saving the best model_gender.."
                    )
                    best_val_loss_gender = val_loss_gender
                elif val_acc_gender > best_val_acc_gender:
                    print(
                        f"New best model_gender for val accuracy : {val_acc_gender:4.2%}! saving the best model_gender.."
                    )
                    best_val_acc_gender = val_acc_gender

            print(
                f"[Val] acc: {val_acc_gender:4.2%}, loss: {val_loss_gender:4.2} || "
                f"best acc: {best_val_acc_gender:4.2%}, best loss: {best_val_loss_gender:4.2}"
            )
            logger_gender.add_scalar("Val/loss", val_loss_gender, epoch)
            logger_gender.add_scalar("Val/accuracy", val_acc_gender, epoch)
            logger_gender.add_figure("results", figure, epoch)
            print()
Example #6
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)
    # args.__dict__ == vars(args)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"),
                             args.dataset)  # MaskBaseDataset
    dataset = dataset_module(data_dir=data_dir, val_ratio=args.val_ratio)
    num_classes = dataset.num_classes  # 18

    # -- augmentation
    transform_module = getattr(import_module("dataset"),
                               args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # -- data_loader
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=8,
                              shuffle=True,
                              pin_memory=use_cuda,
                              drop_last=True)

    val_loader = DataLoader(val_set,
                            batch_size=args.batch_size,
                            num_workers=8,
                            shuffle=False,
                            pin_memory=use_cuda,
                            drop_last=True)

    # -- model
    model_module = getattr(import_module("model"),
                           args.model)  # default: BaseModel
    model = model_module(num_classes=num_classes,
                         grad_point=args.grad_point).to(device)
    model = torch.nn.DataParallel(model)
    # If resuming training, start from the args.continue_epoch checkpoint.
    if args.continue_train:
        try_dir = find_dir_try(args.continue_try_num, model_dir,
                               args.continue_name)
        epoch_dir = find_dir_epoch(args.continue_epoch, try_dir)
        model.load_state_dict(torch.load(epoch_dir))

    # -- loss & metric
    if args.criterion == "cross_entropy":
        criterion = create_criterion(args.criterion)  # default: cross_entropy
    else:
        criterion = create_criterion(
            args.criterion, classes=num_classes)  # default: cross_entropy
    if args.optimizer == "AdamP":
        optimizer = AdamP(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=args.lr,
                          weight_decay=5e-4)
    else:
        opt_module = getattr(import_module('torch.optim'),
                             args.optimizer)  # default: Adam
        optimizer = opt_module(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=5e-4)
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    # -- logging
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    with open(Path(save_dir) / 'config.json', 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()
            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )

                loss_value = 0
                matches = 0

        scheduler.step()

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)

                if figure is None:
                    # inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                    inputs_np = torch.clone(inputs).detach().cpu()
                    inputs_np = inputs_np.permute(0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np, labels, preds,
                        args.dataset != "MaskSplitByProfileDataset")
                    plt.show()

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            if val_loss < best_val_loss or val_acc > best_val_acc:
                save_model(model, epoch, val_loss, val_acc, save_dir,
                           args.model)
                if val_loss < best_val_loss and val_acc > best_val_acc:
                    print(
                        f"New best model for val acc and val loss : {val_acc:4.2%} {val_loss:4.2}! saving the best model.."
                    )
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                elif val_loss < best_val_loss:
                    print(
                        f"New best model for val loss : {val_loss:4.2}! saving the best model.."
                    )
                    best_val_loss = val_loss
                elif val_acc > best_val_acc:
                    print(
                        f"New best model for val accuracy : {val_acc:4.2%}! saving the best model.."
                    )
                    best_val_acc = val_acc

            print(
                f"[Val] acc: {val_acc:4.2%}, loss: {val_loss:4.2} || "
                f"best acc: {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
            )
            print()
Example #7
0
    def train(self):
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(
            self.train_dataset,
            sampler=train_sampler,
            batch_size=self.args.train_batch_size,
        )

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = (
                self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
            )
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            eps=self.args.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=t_total,
        )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)
        logger.info("  Save steps = %d", self.args.save_steps)

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")

        criterion1 = create_criterion('cross_entropy')
        criterion2 = create_criterion('f1')
        criterion3 = create_criterion('focal')
        criterion4 = create_criterion('label_smoothing')


        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                # batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
                # inputs = {
                #     "input_ids": batch[0],
                #     "attention_mask": batch[1],
                #     "token_type_ids": batch[2],
                #     "labels": batch[3],
                #     "e1_mask": batch[4],
                #     "e2_mask": batch[5],
                # }
                # outputs = self.model(**inputs)
                # loss = outputs[0]

                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU

                outputs = self.model(input_ids=batch[0],
                                     attention_mask=batch[1],
                                     e1_mask=batch[4],
                                     e2_mask=batch[5])
                _, preds = torch.max(outputs, 1)
                loss1 = criterion3(outputs, batch[3])
                loss2 = criterion4(outputs, batch[3])
                loss = loss1 + loss2

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        self.evaluate("test")  # There is no dev set for semeval task

                    # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    #     self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break

        return global_step, tr_loss / global_step
Example #8
0
    def evaluate(self, mode):
        # We use test dataset because semeval doesn't have dev dataset
        if mode == "test":
            dataset = self.test_dataset
        elif mode == "dev":
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        criterion1 = create_criterion('cross_entropy')
        criterion2 = create_criterion('f1')
        criterion3 = create_criterion('focal')
        criterion4 = create_criterion('label_smoothing')
        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                # inputs = {
                #     "input_ids": batch[0],
                #     "attention_mask": batch[1],
                #     "token_type_ids": batch[2],
                #     "labels": batch[3],
                #     "e1_mask": batch[4],
                #     "e2_mask": batch[5],
                # }
                # outputs = self.model(**inputs)
                # tmp_eval_loss, logits = outputs[:2]

                # print(batch)
                logits = self.model(input_ids=batch[0],
                                    attention_mask=batch[1],
                                    e1_mask=batch[4],
                                    e2_mask=batch[5])

                loss1 = criterion3(logits, batch[3])
                loss2 = criterion4(logits, batch[3])
                tmp_eval_loss = loss1 + loss2

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                # out_label_ids = inputs["labels"].detach().cpu().numpy()
                out_label_ids = batch[3].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                # out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, batch[3].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        results = {"loss": eval_loss}
        preds = np.argmax(preds, axis=1)
        write_prediction(self.args, os.path.join(self.args.eval_dir, "proposed_answers.txt"), preds)

        result = compute_metrics(preds, out_label_ids)
        print(f'evaluate results: {result}')
        results.update(result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  {} = {:.4f}".format(key, results[key]))

        return results
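compute_metrics and write_prediction come from the surrounding project and are not shown. A minimal, assumed version of compute_metrics that would satisfy the dict-of-floats usage above:

# Hypothetical compute_metrics; the project's real helper may report other keys.
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(preds, labels):
    # preds and labels are 1-D arrays of predicted / true class indices.
    return {
        "acc": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }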
Example #9
0
def main(args):
    seed_everything(21)
    load_dotenv()

    if WANDB:
        if args.ENCODER:
            run_name = args.MODEL + "_" + args.ENCODER
        else:
            run_name = args.MODEL

    if args.KFOLD > 1:
        if args.KFOLD != 5:
            print("Only KFOLD=5 is supported")
            return

        # Create the folder where the fold checkpoints (.pt) are saved
        path_pair = args.MODEL_PATH.split(".")
        os.makedirs(path_pair[0], exist_ok=True)
        # Copy args so they can be restored for each fold
        args_origin = copy.deepcopy(args)

    for fold in range(args.KFOLD):
        # Configure the dataloader differently for hold-out vs. k-fold
        if args.KFOLD > 1:
            args = copy.deepcopy(args_origin)
            path_pair = args_origin.MODEL_PATH.split(".")
            # Update MODEL_PATH for this fold
            args.MODEL_PATH = (path_pair[0] + f"/kfold_{fold+1}." +
                               path_pair[1])
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name + f"_k{fold+1}",
                    config=args,
                    reinit=True,
                )
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE, fold_index=fold)
            print(f"\nfold {fold+1} start")
        else:
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name,
                    reinit=True,
                )
                wandb.config.update(args)
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE)
        print("Get loader")

        model = get_model(args.MODEL, args.ENCODER).to(args.device)
        print("Load model")

        if WANDB:
            wandb.watch(model)

        criterion = []
        if "+" in args.LOSS:
            criterion.append("+")
            criterion.append(create_criterion(args.LOSS.split("+")[0]))
            criterion.append(create_criterion(args.LOSS.split("+")[1]))
        elif "-" in args.LOSS:
            criterion.append("-")
            criterion.append(create_criterion(args.LOSS.split("-")[0]))
            criterion.append(create_criterion(args.LOSS.split("-")[1]))
        else:
            criterion.append("0")
            criterion.append(create_criterion(args.LOSS))
        optimizer = create_optimizer(args.OPTIMIZER, model, args.LEARNING_RATE)
        if args.SCHEDULER:
            scheduler = create_scheduler(args.SCHEDULER, optimizer)
        else:
            scheduler = None
        # optimizer = optim.Adam(params = model.parameters(), lr = args.LEARNING_RATE, weight_decay=1e-6)

        print("Run")
        run(args, model, criterion, optimizer, dataloader, fold, scheduler)
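The criterion list built above encodes whether two losses are combined by addition ("+"), subtraction ("-"), or used alone ("0"); run() is expected to interpret that first element. A sketch of how the loss computation inside run() might look (assumed, since run() is not shown):

# Hypothetical helper interpreting the criterion list produced above:
# ["+", loss_a, loss_b], ["-", loss_a, loss_b], or ["0", loss_a].
def compute_loss(criterion, outputs, targets):
    op = criterion[0]
    if op == "+":
        return criterion[1](outputs, targets) + criterion[2](outputs, targets)
    if op == "-":
        return criterion[1](outputs, targets) - criterion[2](outputs, targets)
    return criterion[1](outputs, targets)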
Example #10
0
def get_model():
    # Fetch the model class defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model parameters to GPU memory.
    model.cuda()

    # Track the model with wandb.
    wandb.watch(model)

    # Print the number of trainable parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If there are two or more GPUs, wrap the model in DataParallel.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Fetch the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Fetch the optimizer defined in optimizer.py.
    # DataParallel does not forward attribute access, so reach submodules through
    # .module when the model is wrapped; the same param groups are shared by all
    # optimizer choices, so build them once.
    base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    param_groups = [
        {
            "params": base_model.seg_model.encoder.parameters(),
            "lr": CFG.learning_rate * 0.1
        },
        {
            "params": base_model.seg_model.decoder.parameters()
        },
        {
            "params": base_model.seg_model.segmentation_head.parameters()
        },
    ]
    if CFG.optimizer == "Adam":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     weight_decay=1e-6)
    elif CFG.optimizer == "RAdam":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-8,
                                     weight_decay=0)
    elif CFG.optimizer == "AdamP":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-8,
                                     weight_decay=0)
    elif CFG.optimizer == "AdamW":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     amsgrad=True)
    elif CFG.optimizer == "RMSprop":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate)
    else:
        raise ValueError(f"Unsupported optimizer: {CFG.optimizer}")

    # Fetch the scheduler defined in scheduler.py.
    if CFG.scheduler == "StepLR":
        scheduler = create_scheduler(CFG.scheduler,
                                     optimizer=optimizer,
                                     step_size=5,
                                     gamma=0.95)
    elif CFG.scheduler == "CosineAnnealingWarmupRestarts":
        scheduler = create_scheduler(
            CFG.scheduler,
            optimizer=optimizer,
            first_cycle_steps=5,
            cycle_mult=1.,
            max_lr=1e-4,
            min_lr=1e-7,
        )
    else:
        raise ValueError(f"Unsupported scheduler: {CFG.scheduler}")

    return model, criterion, optimizer, scheduler
Example #11
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)  # default: 42
    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # -- dataset
    dataset_module = getattr(import_module("dataset"),
                             args.dataset)  # default: MaskSplitByProfileDataset
    dataset = dataset_module(data_dir=data_dir)
    num_classes = dataset.num_classes  # 18

    # -- data_loader
    #train_csv = pd.read_csv("/opt/ml/input/data/train/train.csv")

    labels = []
    print("Get Labels from dataset...")
    for i in tqdm(range(len(dataset))):
        _, label = dataset[i]
        labels.append(label)
    labels = np.array(labels)

    # -- augmentation
    '''
    transform_module = getattr(import_module("dataset"), args.augmentation)  # default: BaseAugmentation
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)
    '''
    '''
    stratifiedkfold = StratifiedKFold(n_splits = 5,random_state = 42, shuffle = True)
    folds = []
    
    
    
    
    
    print("Total img counts : ", len(labels))
    
    
    for fold_index, (train_idx, valid_idx) in tqdm(enumerate(stratifiedkfold.split(range(len(labels)), labels))) :
        folds.append({'train' : train_idx, 'valid' : valid_idx})
        
    print()
    print(f'[fold: {fold_index+1}, total fold: {len(folds)}]')
    print(len(train_idx), len(valid_idx))
    print(train_idx)
    print(valid_idx)
    for fold in folds :
        train_subset = Subset(dataset=dataset, indices=train_idx)
        valid_subset = Subset(dataset=dataset, indices=valid_idx)
        train_loader = DataLoader(dataset=train_subset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=use_cuda,
                                  drop_last=True,
                                 )
        val_loader = DataLoader(dataset=valid_subset,
                                  batch_size=args.valid_batch_size,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=use_cuda,
                                  drop_last=True,
                                 )
    '''
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=4,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # -- model
    model_module = getattr(import_module("model"),
                           args.model)  # default: BaseModel
    model = model_module(num_classes=num_classes).to(device)
    model = torch.nn.DataParallel(model)
    #torch.nn.DataParallel : https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html

    # -- loss & metric
    #criterion = create_criterion(args.criterion)  # default: cross_entropy

    df_label = pd.Series(labels)
    label_sorted = df_label.value_counts().sort_index()
    n_label = torch.Tensor(label_sorted.values)
    gamma = 2
    normed_weights = [1 - (gamma * x / sum(n_label)) for x in n_label]
    normed_weights = torch.FloatTensor(normed_weights).to(device)
    # Class-weighted cross entropy plus the configured criterion; the train and
    # val loops below sum the two losses as criterion(...) + criterion2(...).
    criterion = torch.nn.CrossEntropyLoss(weight=normed_weights)
    criterion2 = create_criterion(args.criterion)

    #optimizer = madgrad.MADGRAD(params : any, lr = 0.001, momentum = 0.9, weight_decay = 0, eps = 1e-06)
    try:
        opt_module = getattr(import_module("torch.optim"),
                             args.optimizer)  # default: SGD
    except AttributeError:
        opt_module = getattr(import_module("madgrad"), args.optimizer)
    optimizer = opt_module(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=args.lr,
                           weight_decay=0)
    #scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=4, eta_min=0.000005)
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=args.gamma)

    # -- logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0

    # train starts
    for epoch in tqdm(range(args.epochs)):
        # train loop
        print()
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels) + criterion2(outs, labels)
            #loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()

            labels = labels.cpu().detach().numpy()
            preds = preds.cpu().detach().numpy()
            train_f1 = f1_score(labels, preds, average='macro')

            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch+1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.4%} || lr {current_lr} || "
                    f"F1_score {train_f1:4.4} ")
                logger.add_scalar("Train/loss", train_loss,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/f1_score", train_f1,
                                  epoch * len(train_loader) + idx)
                loss_value = 0
                matches = 0

        scheduler.step()  #lr scheduler

        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                loss_item2 = criterion2(outs, labels).item()
                loss_item = loss_item + loss_item2
                acc_item = (labels == preds).sum().item()

                labels = labels.cpu().detach().numpy()
                preds = preds.cpu().detach().numpy()
                f1_item = f1_score(labels, preds, average='macro')

                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)
                val_f1_items.append(f1_item)

                if figure is None:
                    inputs_np = torch.clone(inputs).detach().cpu().permute(
                        0, 2, 3, 1).numpy()
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np, labels, preds,
                        args.dataset != "MaskSplitByProfileDataset")

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            val_f1 = np.sum(val_f1_items) / len(val_loader)

            best_val_acc = max(best_val_acc, val_acc)

            if val_loss < best_val_loss:
                print(
                    f"New best model for val_loss : {val_loss:4.4}! saving the best loss model.."
                )
                torch.save(
                    model.module.state_dict(),
                    f"{save_dir}/{args.model}_epoch{epoch}_loss_{val_loss}.pth"
                )
                best_val_loss = val_loss
            if val_f1 > best_val_f1:
                print(
                    f"New best model for val_F1_score : {val_f1:4.4}! saving the best F1_score model.."
                )
                torch.save(
                    model.module.state_dict(),
                    f"{save_dir}/{args.model}_epoch{epoch}_f1_{val_f1}.pth")
                best_val_f1 = val_f1
            print(
                f"[Val] loss: {val_loss:4.4}, F1_score {val_f1:4.4}, acc : {val_acc:4.4%} || "
                f"best loss: {best_val_loss:4.4}, best_F1_score {best_val_f1:4.4} , best acc : {best_val_acc:4.4%} "
            )
            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_scalar("Val/f1_score", val_f1, epoch)
            logger.add_figure("results", figure, epoch)
            print()
Example #12
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # -- settings
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    info = pd.read_csv('/opt/ml/input/data/train/train.csv')

    info['gender_age'] = info.apply(lambda x: convert_gender_age(x.gender, x.age), axis = 1)
    n_fold = int(1 / args.val_ratio)

    skf = StratifiedKFold(n_splits = n_fold, shuffle=True)
    info.loc[:, 'fold'] = 0
    for fold_num, (train_index, val_index) in enumerate(skf.split(X = info.index, y = info.gender_age.values)):
        info.loc[info.iloc[val_index].index, 'fold'] = fold_num

    fold_idx = 0
    train = info[info.fold != fold_idx].reset_index(drop=True)
    val = info[info.fold == fold_idx].reset_index(drop=True)

    # -- dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)  # default: MaskDataset

    # -- augmentation
    train_transform_module = getattr(import_module("dataset"), args.train_augmentation)  # default: BaseAugmentation
    val_transform_module = getattr(import_module("dataset"), args.val_augmentation)  # default: BaseAugmentation

    train_transform = train_transform_module(
        resize=args.resize,
        mean=MEAN,
        std=STD,
    )
    val_transform = val_transform_module(
        resize=args.resize,
        mean=MEAN,
        std=STD,
    )
    
    print(train_transform.transform, val_transform.transform)

    if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset':
        if args.dataset == 'MaskOldDataset':
            old_transform_module = getattr(import_module('dataset'), args.old_augmentation)

            old_transform = old_transform_module(
                resize=args.resize,
                mean=MEAN,
                std=STD,
            )
            train_dataset = dataset_module(data_dir, train, train_transform, old_transform)
            if args.val_old:
                val_dataset = dataset_module(data_dir, val, val_transform, old_transform)
            else:
                val_dataset = dataset_module(data_dir, val, val_transform)
        else:
            train_dataset = dataset_module(data_dir, train, train_transform)
            val_dataset = dataset_module(data_dir, val, val_transform)
    else:
        dataset = dataset_module(
            data_dir=data_dir,
        )

        # dataset.set_transform(transform)
        # -- data_loader
        train_set, val_set = dataset.split_dataset()

        train_dataset = DatasetFromSubset(
            train_set, transform = train_transform
        )
        val_dataset = DatasetFromSubset(
            val_set, transform = val_transform
        )

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=use_cuda,
        #drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
        #drop_last=True,
    )

    # -- model
    model_module = getattr(import_module("model"), args.model)  # default: BaseModel
    model = model_module(
        num_classes=args.num_classes
    ).to(device)
    model = torch.nn.DataParallel(model)

    # -- loss & metric
    if args.criterion == 'f1' or args.criterion == 'label_smoothing':
        criterion = create_criterion(args.criterion, classes = args.num_classes)
    else:
        criterion = create_criterion(args.criterion)
    
    opt_module = getattr(import_module("torch.optim"), args.optimizer)  # default: SGD
    optimizer = opt_module(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.lr,
        weight_decay=5e-4
    )
    if args.scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6)
    elif args.scheduler == 'reduce':
        scheduler = ReduceLROnPlateau(optimizer, factor = 0.5, patience = 5)
    elif args.scheduler == 'step':
        scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)
    else:
        scheduler = None

    # -- logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    best_val_acc = 0
    best_val_loss = np.inf
    print("This notebook use [%s]."%(device))

    early_stopping = EarlyStopping(patience = args.patience, verbose = True)

    for epoch in range(args.epochs):
        # train loop
        model.train()
        loss_value = 0
        matches = 0

        train_loss, train_acc = AverageMeter(), AverageMeter()

        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset':
                labels = labels.argmax(dim = -1)
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            #loss_value += loss.item()
            #matches += (preds == labels).sum().item()
            acc = (preds == labels).sum().item() / len(labels)

            train_loss.update(loss.item(), len(labels))
            train_acc.update(acc, len(labels))

            if (idx + 1) % args.log_interval == 0:
                #train_loss = loss_value / args.log_interval
                #train_acc = matches / args.batch_size / args.log_interval
                train_f1_acc = f1_score(preds.cpu().detach().type(torch.int), labels.cpu().detach().type(torch.int), average='macro')
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || train_f1_acc {train_f1_acc:.4} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss.avg, epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc.avg, epoch * len(train_loader) + idx)

                loss_value = 0
                matches = 0

        # ReduceLROnPlateau needs the validation loss, so it is stepped after the val loop instead
        if scheduler is not None and not isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step()
        
        val_loss, val_acc = AverageMeter(), AverageMeter()
        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_labels_items = np.array([])
            val_preds_items = np.array([])
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                if args.dataset == 'MaskDataset' or args.dataset == 'MaskOldDataset':
                    labels = labels.argmax(dim = -1)

                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                #loss_item = criterion(outs, labels).item()
                #acc_item = (labels == preds).sum().item()
                #val_loss_items.append(loss_item)
                #val_acc_items.append(acc_item)
                
                loss = criterion(outs, labels)
                acc = (preds == labels).sum().item() / len(labels)

                val_loss.update(loss.item(), len(labels))
                val_acc.update(acc, len(labels))

                val_labels_items = np.concatenate([val_labels_items, labels.cpu().numpy()])
                val_preds_items = np.concatenate([val_preds_items, preds.cpu().numpy()])

                if figure is None:
                    if epoch % 2:
                        images, labels, preds = get_all_datas(model, device, val_loader)
                        figure = log_confusion_matrix(labels.cpu().numpy(), np.argmax(preds.cpu().numpy(), axis=1), args.num_classes)
                        # figure2 = plots_result(images.cpu().numpy()[:36], labels.cpu().numpy()[:36], preds.cpu().numpy()[:36], args.num_classes, title="plots_result")
                    else:
                        inputs_np = torch.clone(inputs).detach().cpu().permute(0, 2, 3, 1).numpy()
                        inputs_np = val_dataset.denormalize_image(inputs_np, MEAN, STD)
                        figure = grid_image(inputs_np, labels, preds, 9, False)

            # val_loss = np.sum(val_loss_items) / len(val_loader)
            # val_acc = np.sum(val_acc_items) / len(val_set)
            val_f1_acc = f1_score(val_labels_items.astype(int), val_preds_items.astype(int), average='macro')
            
            best_val_acc = max(best_val_acc, val_acc.avg)
            # best_val_loss = min(best_val_loss, val_loss)
            if val_loss.avg < best_val_loss:
                print(f"New best model for val loss : {val_loss.avg:4.2%}! saving the best model..")
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_loss = val_loss.avg
            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || val_f1_acc : {val_f1_acc:.4} || "
                f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss.avg, epoch)
            logger.add_scalar("Val/accuracy", val_acc.avg, epoch)
            logger.add_figure("results", figure, epoch)
            # logger.add_figure("results1", figure2, epoch)
            
            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(val_loss.avg)

            early_stopping(val_loss.avg, model)

            if early_stopping.early_stop:
                print('Early stopping...')
                break

            print()
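The loop above leans on two helpers that are not shown here, AverageMeter and EarlyStopping. A minimal sketch of what they are assumed to do (names and exact behavior are assumptions, not the original utilities):

class AverageMeter:
    """Keeps a running average of a scalar metric (assumed helper)."""

    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class EarlyStopping:
    """Sets early_stop once the monitored loss stops improving for `patience` epochs (assumed helper)."""

    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = float("inf")
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} / {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True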
示例#13
0
def train_model(config, wandb):

    seed_everything(config.seed)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model_module = getattr(import_module("model"), config.model)
    model = model_module(num_classes=18).to(device)

    #model = torch.nn.DataParallel(model)

    ########  DataSet

    transform = DataAugmentation(type=config.transform)  #center_384_1
    dataset = MaskDataset(config.data_dir, transform=transform)

    len_valid_set = int(config.data_ratio * len(dataset))
    len_train_set = len(dataset) - len_valid_set
    dataloaders, batch_num = {}, {}

    train_dataset, valid_dataset = torch.utils.data.random_split(
        dataset, [len_train_set, len_valid_set])
    if config.random_split == 0:
        print("tbd")

    sampler = None

    if config.sampler == 'ImbalancedDatasetSampler':
        sampler = ImbalancedDatasetSampler(train_dataset)

    use_cuda = torch.cuda.is_available()

    dataloaders['train'] = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        sampler=sampler,
        shuffle=(sampler is None),  # a custom sampler already controls the ordering
        num_workers=4,
        pin_memory=use_cuda)

    dataloaders['valid'] = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=use_cuda)

    batch_num['train'], batch_num['valid'] = len(dataloaders['train']), len(
        dataloaders['valid'])

    #Loss
    criterion = create_criterion(config.criterion)

    #Optimizer
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9)

    if config.optim == "AdamP":
        optimizer = AdamP(model.parameters(),
                          lr=config.lr,
                          betas=(0.9, 0.999),
                          weight_decay=config.weight_decay)
    elif config.optim == "AdamW":
        optimizer = optim.AdamW(model.parameters(),
                                lr=config.lr,
                                weight_decay=config.weight_decay)

    #Scheduler
    # Decay LR by a factor of 0.1 every 7 epochs
    #exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
    if config.lr_scheduler == "cosine":
        print('cosine')
        Q = math.floor(len(train_dataset) / config.batch_size +
                       1) * config.epochs / 7
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)
        # CosineAnnealingWarmRestarts could also be used here

    since = time.time()
    low_train = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    train_loss, train_acc, valid_loss, valid_acc = [], [], [], []
    num_epochs = config.epochs
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss, running_corrects, num_cnt = 0.0, 0, 0
            running_f1 = 0

            # Iterate over data.
            idx = 0
            for inputs, labels in dataloaders[phase]:
                idx += 1
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    else:
                        running_f1 += f1_score(labels.data.detach().cpu(),
                                               preds.detach().cpu(),
                                               average='macro')

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                num_cnt += len(labels)
                if idx % 100 == 0:
                    # criterion already returns the batch-mean loss, so no extra scaling is needed
                    print(
                        f"Epoch[{epoch}/{config.epochs}]({idx}/{batch_num[phase]}) || "
                        f"{phase} loss {loss.item():4.4}")

            if phase == 'train':
                scheduler.step()

            epoch_loss = float(running_loss / num_cnt)
            epoch_acc = float(
                (running_corrects.double() / num_cnt).cpu() * 100)
            # running_f1 accumulates per-batch macro F1, so average over batches rather than samples
            epoch_f1 = float(running_f1 / batch_num[phase])
            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc)
                if config.wandb:
                    wandb.log({"Train acc": epoch_acc})
            else:
                valid_loss.append(epoch_loss)
                valid_acc.append(epoch_acc)
                if config.wandb:
                    wandb.log({"Valid acc": epoch_acc})
                    wandb.log({"F1 Score": epoch_f1})

            print('{} Loss: {:.2f} Acc: {:.1f} f1 :{:.3f}'.format(
                phase, epoch_loss, epoch_acc, epoch_f1))

            # deep copy the model
            if phase == 'valid':
                if epoch_acc > best_acc:
                    best_idx = epoch
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    print('==> best model saved - %d / %.1f' %
                          (best_idx, best_acc))
                    low_train = 0
                elif epoch_acc < best_acc:
                    print('==> no improvement this epoch')
                    low_train += 1

        if low_train > 0 and epoch > 4:
            break

        # after the phase loop, `phase` is always 'valid', so this checks the validation accuracy
        if phase == 'valid':
            if epoch_acc < 80:
                print('Stopping: validation accuracy is too low')
                break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best valid Acc: %d - %.1f' % (best_idx, best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    #torch.save(model.state_dict(), 'mask_model.pt')
    torch.save(model.state_dict(), config.name + '.pt')
    print('model saved')
    if config.wandb:
        wandb.finish()
    return model, best_idx, best_acc, train_loss, train_acc, valid_loss, valid_acc
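Most of these snippets fetch their loss through a create_criterion factory defined in a local criterion module that is not shown. A minimal sketch of such a factory, assuming only a couple of registered losses (the real module presumably registers 'f1', 'focal', 'label_smoothing', and others):

import torch.nn as nn

_CRITERION_ENTRYPOINTS = {
    'cross_entropy': nn.CrossEntropyLoss,
    # the project's criterion module is assumed to register more entries here
}


def create_criterion(criterion_name, **kwargs):
    # Look the loss class up by name and instantiate it with any extra kwargs
    # (e.g. classes=..., smoothing=... as used in the snippets above).
    if criterion_name not in _CRITERION_ENTRYPOINTS:
        raise RuntimeError(f'Unknown criterion: {criterion_name}')
    return _CRITERION_ENTRYPOINTS[criterion_name](**kwargs)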
示例#14
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \
            '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize

    if args.name:
        s_dir += '-' + args.name
    save_dir = increment_path(os.path.join(model_dir, s_dir))

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                          config=bert_config)
    model.dropout = nn.Dropout(p=args.drop)
    model.to(device)

    summary(model)

    # loss & optimizer
    if args.criterion == 'f1' or args.criterion == 'label_smoothing' or args.criterion == 'f1cross':
        criterion = create_criterion(args.criterion,
                                     classes=args.num_labels,
                                     smoothing=0.1)
    else:
        criterion = create_criterion(args.criterion)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    if args.optimizer == 'AdamP':
        optimizer = AdamP(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=args.lr,
                          betas=(0.9, 0.999),
                          weight_decay=args.weight_decay)
    else:
        opt_module = getattr(import_module("torch.optim"),
                             args.optimizer)  # default: SGD
        optimizer = opt_module(
            optimizer_grouped_parameters,
            lr=args.lr,
        )

    # logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    set_neptune(save_dir, args)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # train, val split
    kfold = StratifiedKFold(n_splits=5)

    for train_idx, val_idx in kfold.split(dataset, labels):
        train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[
            val_idx]
        break

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=use_cuda,
    )

    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=12,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
    )

    if args.scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6)
    elif args.scheduler == 'reduce':
        scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
    elif args.scheduler == 'step':
        scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)
    elif args.scheduler == 'cosine_warmup':
        t_total = len(train_loader) * args.epochs
        warmup_step = int(t_total * args.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_step,
            num_training_steps=t_total)
    else:
        scheduler = None

    print("Training Start!!!")

    best_val_acc = 0
    best_val_loss = np.inf

    for epoch in range(args.epochs):
        # train loop
        model.train()

        train_loss, train_acc = AverageMeter(), AverageMeter()

        for idx, train_batch in enumerate(train_loader):
            optimizer.zero_grad()

            try:
                inputs, token_types, attention_mask, labels = train_batch.values(
                )
                inputs = inputs.to(device)
                token_types = token_types.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs,
                             token_type_ids=token_types,
                             attention_mask=attention_mask)
            except Exception:  # fall back for models/tokenizers without token_type_ids
                inputs, attention_mask, labels = train_batch.values()
                inputs = inputs.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs, attention_mask=attention_mask)

            preds = torch.argmax(outs.logits, dim=-1)
            loss = criterion(outs.logits, labels)
            acc = (preds == labels).sum().item() / len(labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
            optimizer.step()

            if scheduler is not None and not isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step()  # per-step schedulers; ReduceLROnPlateau would need the validation loss instead

            neptune.log_metric('learning_rate', get_lr(optimizer))

            train_loss.update(loss.item(), len(labels))
            train_acc.update(acc, len(labels))

            if (idx + 1) % args.log_interval == 0:
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss.avg,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc.avg,
                                  epoch * len(train_loader) + idx)

        neptune.log_metric('Train_loss', train_loss.avg)
        neptune.log_metric('Train_avg', train_acc.avg)
        neptune.log_metric('learning_rate', get_lr(optimizer))

        val_loss, val_acc = AverageMeter(), AverageMeter()
        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()

            for val_batch in val_loader:
                try:
                    inputs, token_types, attention_mask, labels = val_batch.values(
                    )
                    inputs = inputs.to(device)
                    token_types = token_types.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 token_type_ids=token_types,
                                 attention_mask=attention_mask)
                except Exception:  # fall back for models/tokenizers without token_type_ids
                    inputs, attention_mask, labels = val_batch.values()
                    inputs = inputs.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 attention_mask=attention_mask)

                preds = torch.argmax(outs.logits, dim=-1)
                loss = criterion(outs.logits, labels)
                acc = (preds == labels).sum().item() / len(labels)

                val_loss.update(loss.item(), len(labels))
                val_acc.update(acc, len(labels))

            if val_acc.avg > best_val_acc:
                print(
                    f"New best model for val acc : {val_acc.avg:4.2%}! saving the best model.."
                )
                torch.save(model.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc.avg
                best_val_loss = min(best_val_loss, val_loss.avg)

            print(
                f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || "
                f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss.avg, epoch)
            logger.add_scalar("Val/accuracy", val_acc.avg, epoch)
            neptune.log_metric('Val_loss', val_loss.avg)
            neptune.log_metric('Val_avg', val_acc.avg)

            print()
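get_lr, which every training loop here uses for logging, is assumed to simply read the current learning rate off the optimizer's first parameter group; a minimal sketch:

def get_lr(optimizer):
    # Return the learning rate of the first parameter group (assumed helper).
    for param_group in optimizer.param_groups:
        return param_group['lr']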
示例#15
0
def train_no_val(img_dir, model_dir, args):
    seed_everything(args.seed)

    start = time.time()
    get_current_time()

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(
        img_dir=img_dir,
        val_ratio=args.val_ratio,
    )
    num_classes = dataset.num_classes

    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(mean=dataset.mean, std=dataset.std)

    dataset.set_transform(transform["train"])

    train_loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)

    model = torch.nn.DataParallel(model)

    criterion = create_criterion(args.criterion)

    optimizer = None
    if args.optimizer == "AdamP":
        optimizer = AdamP(model.parameters())
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
        optimizer = opt_module(
            model.parameters(),
            # filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            # weight_decay=5e-4,
        )

    # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    logger = SummaryWriter(log_dir=save_dir)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0

    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        train_f1 = 0
        for i, data in enumerate(tqdm(train_loader)):
            imgs, labels = data
            imgs = imgs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, 1)
            acc = (preds == labels).sum().item() / len(imgs)
            t_f1_score = f1_score(
                labels.cpu().detach().numpy(),
                preds.cpu().detach().numpy(),
                average="macro",
            )

            train_loss += loss.item()  # use .item() so the computation graph is not kept alive
            train_acc += acc
            train_f1 += t_f1_score

            if (i + 1) % args.log_interval == 0:
                train_loss /= args.log_interval
                train_acc /= args.log_interval
                train_f1 /= args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || trainin_loss {train_loss:.4f} || training acc {train_acc:.4f} || train f1_score {train_f1:.4f} || lr {current_lr}"
                )

                logger.add_scalar("Train/loss", train_loss,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/accuracy", train_acc,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/F1-score", train_f1,
                                  epoch * len(train_loader) + i)

                train_loss = 0
                train_acc = 0
                train_f1 = 0

    torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

    # How much time training taken
    times = time.time() - start
    minute, sec = divmod(times, 60)
    print(f"Finish Training! Taken time is {minute} minutes {sec} seconds")
示例#16
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # gpu setting
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(data_dir=data_dir, )
    num_classes = dataset.num_classes  # 18

    # apply tranform to dataset
    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(
        resize=args.resize,
        mean=dataset.mean,
        std=dataset.std,
    )
    dataset.set_transform(transform)

    # create dataloader
    train_set, val_set = dataset.split_dataset()

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=1,
        shuffle=True,
        pin_memory=use_cuda,
        drop_last=True,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=args.valid_batch_size,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
        drop_last=True,
    )

    # create model
    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)

    # load weights of pretrained model
    weight_path = f"{model_dir}/efficientnetb4_sgd2/last.pth"
    model.load_state_dict(torch.load(weight_path))
    model = torch.nn.DataParallel(model)

    # create criterion, optimizer and scheduler
    criterion = create_criterion(args.criterion)
    if args.optimizer == "madgrad":
        opt_module = MADGRAD
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
    optimizer = opt_module(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=args.lr,
        weight_decay=5e-4,
    )
    scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0
    for epoch in range(args.epochs):
        # training a model
        model.train()
        loss_value = 0
        matches = 0
        for idx, train_batch in enumerate(train_loader):
            inputs, labels = train_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)
            loss = criterion(outs, labels)

            loss.backward()
            optimizer.step()

            loss_value += loss.item()
            matches += (preds == labels).sum().item()

            if (idx + 1) % args.log_interval == 0:
                train_loss = loss_value / args.log_interval
                train_acc = matches / args.batch_size / args.log_interval
                current_lr = get_lr(optimizer)

                print(
                    f"Epoch[{epoch+1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                )

                loss_value = 0
                matches = 0

        scheduler.step()

        # validate a model
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []
            figure = None
            for val_batch in val_loader:
                inputs, labels = val_batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                outs = model(inputs)
                preds = torch.argmax(outs, dim=-1)

                loss_item = criterion(outs, labels).item()
                acc_item = (labels == preds).sum().item()
                f1_item = f1_score(labels.cpu().numpy(),
                                   preds.cpu().numpy(),
                                   average="macro")
                val_loss_items.append(loss_item)
                val_acc_items.append(acc_item)
                val_f1_items.append(f1_item)

                if figure is None:
                    inputs_np = (torch.clone(inputs).detach().cpu().permute(
                        0, 2, 3, 1).numpy())
                    inputs_np = dataset_module.denormalize_image(
                        inputs_np, dataset.mean, dataset.std)
                    figure = grid_image(
                        inputs_np,
                        labels,
                        preds,
                        args.dataset != "MaskSplitByProfileDataset",
                    )

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_set)
            val_f1 = np.sum(val_f1_items) / len(val_loader)

            best_val_loss = min(best_val_loss, val_loss)

            if val_acc > best_val_acc:
                best_val_acc = val_acc  # update the minimum loss

            if val_f1 > best_val_f1:
                print(
                    f"New best model for val f1 : {val_f1:4.2f}! saving the best model.."
                )
                # save the model
                torch.save(model.module.state_dict(),
                           f"{save_dir}/best_eph_{epoch}.pth")
                best_val_f1 = val_f1  # update the maximum f1 score

            torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2}, f1: {val_f1:4.2f} || "
                f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}, best f1: {best_val_f1:4.2f}"
            )

            print()
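denormalize_image, called on the dataset above before plotting, is assumed to undo the Normalize(mean, std) transform so images can be rendered; a standalone sketch under that assumption:

import numpy as np


def denormalize_image(image, mean, std):
    # image: HWC float array in normalized space; returns a uint8 image in [0, 255].
    img = image.copy()
    img *= std
    img += mean
    img *= 255.0
    return np.clip(img, 0, 255).astype(np.uint8)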
示例#17
0
def train(img_dir, model_dir, args):
    seed_everything(args.seed)

    start = time.time()
    get_current_time()

    save_dir = increment_path(os.path.join(model_dir, args.name))

    # settings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # dataset
    dataset_module = getattr(import_module("dataset"), args.dataset)
    dataset = dataset_module(
        img_dir=img_dir,
        val_ratio=args.val_ratio,
    )
    num_classes = dataset.num_classes

    transform_module = getattr(import_module("dataset"), args.augmentation)
    transform = transform_module(mean=dataset.mean, std=dataset.std)

    train_dataset, val_dataset = dataset.split_dataset()
    train_dataset.dataset.set_transform(transform["train"])
    val_dataset.dataset.set_transform(transform["val"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=args.valid_batch_size,
        num_workers=2,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
        drop_last=True,
    )

    model_module = getattr(import_module("model"), args.model)
    model = model_module(num_classes=num_classes).to(device)

    model = torch.nn.DataParallel(model)

    criterion = create_criterion(args.criterion)

    optimizer = None
    if args.optimizer == "AdamP":
        optimizer = AdamP(model.parameters(), lr=args.lr)
    else:
        opt_module = getattr(import_module("torch.optim"), args.optimizer)
        optimizer = opt_module(
            model.parameters(),
            # filter(lambda p: p.requires_grad, model.parameters()),
            lr=args.lr,
            # weight_decay=5e-4,
        )

    # scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)

    logger = SummaryWriter(log_dir=save_dir)

    best_val_acc = 0
    best_val_loss = np.inf
    best_val_f1 = 0

    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        train_f1 = 0
        for i, data in enumerate(tqdm(train_loader)):
            imgs, labels = data
            imgs = imgs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(outputs, 1)
            acc = (preds == labels).sum().item() / len(imgs)
            t_f1_score = f1_score(
                labels.cpu().detach().numpy(),
                preds.cpu().detach().numpy(),
                average="macro",
            )

            train_loss += loss.item()  # use .item() so the computation graph is not kept alive
            train_acc += acc
            train_f1 += t_f1_score

            if (i + 1) % args.log_interval == 0:
                train_loss /= args.log_interval
                train_acc /= args.log_interval
                train_f1 /= args.log_interval
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({i + 1}/{len(train_loader)}) || trainin_loss {train_loss:.4f} || training acc {train_acc:.4f} || train f1_score {train_f1:.4f} || lr {current_lr}"
                )

                logger.add_scalar("Train/loss", train_loss,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/accuracy", train_acc,
                                  epoch * len(train_loader) + i)
                logger.add_scalar("Train/F1-score", train_f1,
                                  epoch * len(train_loader) + i)

                train_loss = 0
                train_acc = 0
                train_f1 = 0

        # scheduler.step()

        # Training for an epoch is only complete here, so validate once per epoch
        # and keep the checkpoint with the best score seen so far.
        with torch.no_grad():
            print("Validation step---------------------")
            model.eval()
            val_loss_items = []
            val_acc_items = []
            val_f1_items = []

            for data in tqdm(val_loader):
                imgs, labels = data
                imgs = imgs.float().to(device)
                labels = labels.long().to(device)

                outputs = model(imgs)
                preds = torch.argmax(outputs, 1)

                loss = criterion(outputs, labels).item()
                acc = (labels == preds).sum().item()
                val_f1 = f1_score(
                    labels.cpu().detach().numpy(),
                    preds.cpu().detach().numpy(),
                    average="macro",
                )

                val_loss_items.append(loss)
                val_acc_items.append(acc)
                val_f1_items.append(val_f1)

            val_loss = np.sum(val_loss_items) / len(val_loader)
            val_acc = np.sum(val_acc_items) / len(val_dataset)
            val_f1 = np.sum(val_f1_items) / len(val_loader)

            print(
                f"val_loader: {len(val_loader)} | val_dataset: {len(val_dataset)}"
            )

            best_val_loss = min(best_val_loss, val_loss)
            best_val_acc = max(val_acc, best_val_acc)
            # best_val_f1 is updated below, only when a new best checkpoint is actually saved

            # if val_acc > best_val_acc:
            # print(
            #     f"New best model for val acc: {val_acc:4.2%}! saving the best model..."
            # )
            #     torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
            #     best_val_acc = val_acc

            if val_f1 > best_val_f1:
                print(
                    f"New best model for val f1: {val_f1:.4f}! saving the best model..."
                )
                torch.save(model.module.state_dict(), f"{save_dir}/best.pth")
                best_val_f1 = val_f1

            # TODO: is this really the right place to save the last model?
            # torch.save(model.module.state_dict(), f"{save_dir}/last.pth")
            print(
                f"[Val] acc: {val_acc:.4f}, loss: {val_loss:.4f} || best acc: {best_val_acc:.4f}, best loss: {best_val_loss:.4f}"
            )

            logger.add_scalar("Val/loss", val_loss, epoch)
            logger.add_scalar("Val/accuracy", val_acc, epoch)
            logger.add_scalar("Val/f1-score", val_f1, epoch)
            print()

    torch.save(model.module.state_dict(), f"{save_dir}/last.pth")

    # How much time training taken
    times = time.time() - start
    minute, sec = divmod(times, 60)
    print(f"Finish Training! Taken time is {minute} minutes {sec} seconds")
示例#18
0
def start(config, wandb):
    # Loss function definition
    dataset_path = '/opt/ml/input/data'
    test_path = dataset_path + '/test.json'

    num = config.data_ratio

    if num == -1:
        train_path = dataset_path + '/train.json'
        val_path = dataset_path + '/val.json'
    else:
        train_path = dataset_path + '/train_data' + str(num) + '.json'
        val_path = dataset_path + '/valid_data' + str(num) + '.json'

    print(train_path)
    print(val_path)
    seed_everything(config.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print('pytorch version: {}'.format(torch.__version__))
    print('GPU available: {}'.format(torch.cuda.is_available()))

    print(torch.cuda.get_device_name(0))
    print(torch.cuda.device_count())

    train_transform = getattr(import_module("dataset"),
                              "data_" + config.transform)()

    train_dataset = CustomDataLoader(data_dir=train_path,
                                     mode='train',
                                     transform=train_transform)
    #train_dataset=CutMix(train_dataset, num_class=12, beta=1.0, prob=0.5, num_mix=2)

    # validation dataset
    val_transform = getattr(import_module("dataset"),
                            "data_" + config.vtransform)()
    val_dataset = CustomDataLoader(data_dir=val_path,
                                   mode='val',
                                   transform=val_transform)

    batch_size = config.batch_size
    # DataLoader

    # create own Dataset 1 (skip)
    # if you want to split the validation set yourself:
    # split the dataset 8:2 with random_split
    # train_size = int(0.8*len(dataset))
    # val_size = int(len(dataset)-train_size)
    # dataset = CustomDataLoader(data_dir=train_path, mode='train', transform=transform)
    # train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=3,
                                               drop_last=True,
                                               collate_fn=collate_fn)

    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             drop_last=True,
                                             num_workers=2,
                                             collate_fn=collate_fn)

    if config.enc_name == "basic":
        mode_str = "model." + config.model.lower()
        model_module = getattr(import_module(mode_str), config.model)
        model = model_module(num_classes=12).to(device)
    else:
        model_module = get_smp_model(config.model, config.enc_name)
        model = model_module.to(device)

    #Loss
    criterion = create_criterion(config.criterion)

    #criterion = [SoftCrossEntropyLoss(smooth_factor=0.1), JaccardLoss('multiclass', classes = 12)]

    #Optimizer
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9)

    if config.optim == "AdamP":
        optimizer = AdamP(model.parameters(),
                          lr=config.lr,
                          betas=(0.9, 0.999),
                          weight_decay=config.weight_decay)
    elif config.optim == "AdamW":
        optimizer = optim.AdamW(model.parameters(),
                                lr=config.lr,
                                weight_decay=config.weight_decay)
    elif config.optim == "Adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=config.lr,
                               weight_decay=config.weight_decay)

    lookahead = Lookahead(optimizer, k=5, alpha=0.5)  # Initialize Lookahead

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
    if config.lr_scheduler == "cosine":
        print('cosine')
        #Q = 2
        Q = config.epochs
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                   T_max=Q,
                                                   eta_min=1e-7)
    elif config.lr_scheduler == "cosinew":
        print(" ConsineAnnealingWarmRestarts ")
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(lookahead,
                                                             T_0=30,
                                                             T_mult=2,
                                                             eta_min=0)
    elif config.lr_scheduler == "cosinew_custom":
        print(
            "https://gaussian37.github.io/dl-pytorch-lr_scheduler/#cosineannealingwarmrestarts-1"
        )
        optimizer = torch.optim.Adam(model.parameters(), lr=0)
        lookahead = Lookahead(optimizer, k=5, alpha=0.5)
        scheduler = CustomCosineAnnealingWarmUpRestarts(optimizer,
                                                        T_0=config.epochs,
                                                        T_mult=1,
                                                        eta_max=config.lr,
                                                        T_up=8,
                                                        gamma=0.5)
    elif config.lr_scheduler == "gradual_warmuplr":
        print("#https://www.kaggle.com/pukkinming/pytorchgradualwarmuplr")

    train(model, train_loader, val_loader, criterion, optimizer, scheduler,
          config, device, lookahead)

    psudo_labeling(model, train_loader, val_loader, criterion, optimizer,
                   scheduler, config, device, lookahead)
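The segmentation DataLoaders above pass a collate_fn that is not defined in this snippet; for a dataset that returns (image, mask, image_info) tuples it is commonly just a transpose of the batch. This is an assumption, not the original code:

def collate_fn(batch):
    # Turn a list of (image, mask, info) samples into parallel tuples
    # so extras that cannot be stacked into one tensor stay as plain tuples.
    return tuple(zip(*batch))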