Example #1
    def objective(trial):

        config = toml.load(args.config)

        lr = 1e-3
        #config['block'][0]['stride'] = [trial.suggest_int('stride', 4, 6)]

        # C1
        config['block'][0]['kernel'] = [
            int(trial.suggest_discrete_uniform('c1_kernel', 1, 129, 2))
        ]
        config['block'][0]['filters'] = trial.suggest_int(
            'c1_filters', 1, 1024)

        # B1 - B5
        for i in range(1, 6):
            config['block'][i]['repeat'] = trial.suggest_int(
                'b%s_repeat' % i, 1, 9)
            config['block'][i]['filters'] = trial.suggest_int(
                'b%s_filters' % i, 1, 512)
            config['block'][i]['kernel'] = [
                int(trial.suggest_discrete_uniform('b%s_kernel' % i, 1, 129,
                                                   2))
            ]

        # C2
        config['block'][-2]['kernel'] = [
            int(trial.suggest_discrete_uniform('c2_kernel', 1, 129, 2))
        ]
        config['block'][-2]['filters'] = trial.suggest_int(
            'c2_filters', 1, 1024)

        # C3
        config['block'][-1]['kernel'] = [
            int(trial.suggest_discrete_uniform('c3_kernel', 1, 129, 2))
        ]
        config['block'][-1]['filters'] = trial.suggest_int(
            'c3_filters', 1, 1024)

        model = load_symbol(config, 'Model')(config)
        num_params = sum(p.numel() for p in model.parameters())

        print("[trial %s]" % trial.number)

        if num_params > args.max_params:
            print("[pruned] network too large")
            raise optuna.exceptions.TrialPruned()

        model.to(args.device)
        model.train()

        os.makedirs(workdir, exist_ok=True)

        optimizer = AdamW(model.parameters(), amsgrad=True, lr=lr)
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        scheduler = CosineAnnealingLR(optimizer,
                                      args.epochs * len(train_loader))

        for epoch in range(1, args.epochs + 1):

            try:
                train_loss, duration = train(model,
                                             device,
                                             train_loader,
                                             optimizer,
                                             use_amp=True)
                val_loss, val_mean, val_median = test(model, device,
                                                      test_loader)
                print(
                    "[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%"
                    .format(epoch, workdir, val_loss, val_mean, val_median))
            except KeyboardInterrupt:
                exit()
            except Exception:
                print("[pruned] exception")
                raise optuna.exceptions.TrialPruned()

            if np.isnan(val_loss): val_loss = 9.9
            trial.report(val_loss, epoch)

            if trial.should_prune():
                print("[pruned] unpromising")
                raise optuna.exceptions.TrialPruned()

        trial.set_user_attr('seed', args.seed)
        trial.set_user_attr('val_loss', val_loss)
        trial.set_user_attr('val_mean', val_mean)
        trial.set_user_attr('val_median', val_median)
        trial.set_user_attr('train_loss', train_loss)
        trial.set_user_attr('batchsize', args.batch)
        trial.set_user_attr('model_params', num_params)

        torch.save(model.state_dict(),
                   os.path.join(workdir, "weights_%s.tar" % trial.number))
        toml.dump(
            config,
            open(os.path.join(workdir, 'config_%s.toml' % trial.number), 'w'))

        print("[loss] %.4f" % val_loss)
        return val_loss
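
A minimal sketch of driving this objective with an Optuna study; the pruner choice and trial count below are assumptions, not part of the example:

import optuna

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)
print(study.best_trial.params)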
def train(train_dataloader, query_dataloader, retrieval_dataloader, arch, code_length, device, lr,
          max_iter, topk, evaluate_interval, anchor_num, proportion
          ):
    #print("using device")
    #print(torch.cuda.current_device())
    #print(torch.cuda.get_device_name(torch.cuda.current_device()))
    # Load model
    model = load_model(arch, code_length).to(device)
    model_mo = load_model_mo(arch).to(device)

    # Create criterion, optimizer, scheduler
    criterion = PrototypicalLoss()
    optimizer = optim.RMSprop(
        model.parameters(),
        lr=lr,
        weight_decay=5e-4,
    )
    scheduler = CosineAnnealingLR(
        optimizer,
        max_iter,
        lr / 100,
    )

    # Initialization
    running_loss = 0.
    best_map = 0.
    training_time = 0.
    checkpoint = None  # avoids NameError if no evaluation ever improves on best_map

    # Training
    for it in range(max_iter):
        # timer
        tic = time.time()

        # harvest prototypes/anchors; this pass sometimes got killed, so it is done another way
        with torch.no_grad():
            output_mo = torch.tensor([]).to(device)
            for data, _, _ in train_dataloader:
                data = data.to(device)
                output_mo_temp = model_mo(data)
                output_mo = torch.cat((output_mo, output_mo_temp), 0)
                torch.cuda.empty_cache()

            anchor = get_anchor(output_mo, anchor_num, device)  # compute anchor

        # self-supervised deep learning
        model.train()
        for data, targets, index in train_dataloader:
            data, targets, index = data.to(device), targets.to(device), index.to(device)
            optimizer.zero_grad()

            # output
            output_B = model(data)
            output_mo_batch = model_mo(data)

            # prototypes/anchors based similarity

            #sample_anchor_distance = torch.sqrt(torch.sum((output_mo_batch[:, None, :] - anchor) ** 2, dim=2)).to(device)
            #sample_anchor_dist_normalize = F.normalize(sample_anchor_distance, p=2, dim=1).to(device)
            #S = sample_anchor_dist_normalize @ sample_anchor_dist_normalize.t()

            # loss
            #loss = criterion(output_B, S)
            #running_loss = running_loss + loss.item()
            #loss.backward(retain_graph=True)
            with torch.no_grad():
                dist = torch.sum((output_mo_batch[:, None, :] - anchor.to(device)) ** 2, dim=2)
                k = dist.size(1)
                dist = torch.exp(-1 * dist / torch.max(dist)).to(device)
                Z_su = torch.ones(k, 1).to(device)
                Z_sum = torch.sqrt(dist.mm(Z_su)) + 1e-12
                Z_simi = torch.div(dist, Z_sum).to(device)
                S = (Z_simi.mm(Z_simi.t()))
                S = (2 / (torch.max(S) - torch.min(S))) * S - 1


            loss = criterion(output_B, S)

            running_loss += loss.item()
            loss.backward()

            optimizer.step()
        with torch.no_grad():
            # momentum update:
            for param_q, param_k in zip(model.parameters(), model_mo.parameters()):
                param_k.data = param_k.data * proportion + param_q.data * (1. - proportion)  # proportion = 0.999 for update

        scheduler.step()
        training_time += time.time() - tic

        # Evaluate
        if it % evaluate_interval == evaluate_interval - 1:
            # Generate hash code
            query_code = generate_code(model, query_dataloader, code_length, device)
            retrieval_code = generate_code(model, retrieval_dataloader, code_length, device)

            query_targets = query_dataloader.dataset.get_onehot_targets()
            retrieval_targets = retrieval_dataloader.dataset.get_onehot_targets()

            # Compute map
            mAP = mean_average_precision(
                query_code.to(device),
                retrieval_code.to(device),
                query_targets.to(device),
                retrieval_targets.to(device),
                device,
                topk,
            )

            # Compute pr curve
            P, R = pr_curve(
                query_code.to(device),
                retrieval_code.to(device),
                query_targets.to(device),
                retrieval_targets.to(device),
                device,
            )

            # Log
            logger.info('[iter:{}/{}][loss:{:.2f}][map:{:.4f}][time:{:.2f}]'.format(
                it + 1,
                max_iter,
                running_loss / evaluate_interval,
                mAP,
                training_time,
            ))
            running_loss = 0.

            # Checkpoint
            if best_map < mAP:
                best_map = mAP

                checkpoint = {
                    'model': model.state_dict(),
                    'qB': query_code.cpu(),
                    'rB': retrieval_code.cpu(),
                    'qL': query_targets.cpu(),
                    'rL': retrieval_targets.cpu(),
                    'P': P,
                    'R': R,
                    'map': best_map,
                }

    return checkpoint
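
A hedged usage sketch of the function above; the loaders and every hyperparameter value are placeholders chosen for illustration, only the signature comes from the code:

# train_loader must yield (data, targets, index); the query/retrieval loaders need a
# dataset exposing get_onehot_targets(), as assumed by the evaluation block above.
checkpoint = train(
    train_dataloader=train_loader,
    query_dataloader=query_loader,
    retrieval_dataloader=retrieval_loader,
    arch='alexnet',                # placeholder backbone name
    code_length=64,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    lr=1e-5,
    max_iter=50,
    topk=5000,
    evaluate_interval=10,
    anchor_num=300,
    proportion=0.999,              # momentum coefficient for the model_mo update
)
torch.save(checkpoint, 'checkpoint.pt')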
def cos_lr_scheduler(optimizer, t_max=3):
    return CosineAnnealingLR(optimizer, T_max=t_max)
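
A self-contained sketch of the schedule this helper produces; the dummy model and step count are illustrative only:

import torch
from torch.optim import SGD

dummy = torch.nn.Linear(4, 2)
opt = SGD(dummy.parameters(), lr=0.1)
sched = cos_lr_scheduler(opt, t_max=3)
for step in range(6):
    opt.step()                        # scheduler.step() should follow optimizer.step()
    sched.step()
    print(step, sched.get_last_lr())  # lr follows a cosine curve with period 2 * t_max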
    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate)
        # scheduler = StepLR(optimizer, step_size=300)
        scheduler = CosineAnnealingLR(optimizer, self.trainer.max_epochs,
                                      10e-6)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}
def train(args, config, io):
    train_loader, validation_loader, unlabelled_loader = get_loader(
        args, config)

    device = torch.device("cuda" if args.cuda else "cpu")

    #Try to load models
    model = DNN(args).to(device)
    ema_model = DNN(args).to(device)
    for param in ema_model.parameters():
        param.detach_()
    if device == torch.device("cuda"):
        model = nn.DataParallel(model)
        ema_model = nn.DataParallel(ema_model)
    if args.model_path != "":
        model.load_state_dict(torch.load(args.model_path))
        ema_model.load_state_dict(torch.load(args.model_path))

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr * 100,
                        momentum=args.momentum,
                        weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)

    criterion = nn.MSELoss()
    consistency_criterion = nn.MSELoss()

    best_test_loss = 9999999.
    global_step = 0
    for epoch in range(args.epochs):
        startTime = time.time()

        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        ema_model.train()
        i = -1
        for (data, label), (u, _) in zip(cycle(train_loader),
                                         unlabelled_loader):
            i = i + 1
            if data.shape[0] != u.shape[0]:
                bt_size = np.minimum(data.shape[0], u.shape[0])
                data = data[0:bt_size]
                label = label[0:bt_size]
                u = u[0:bt_size]
            data, label, u = data.to(device), label.to(device), u.to(device)
            batch_size = data.shape[0]
            logits = model(data)
            class_loss = criterion(logits, label)

            u_student = jitter(u, device)
            u_teacher = jitter(u, device)
            logits_unlabeled = model(u_student)
            ema_logits_unlabeled = ema_model(u_teacher)
            ema_logits_unlabeled = ema_logits_unlabeled.detach()
            consistency_loss = consistency_criterion(logits_unlabeled,
                                                     ema_logits_unlabeled)
            if epoch < args.consistency_rampup_starts:
                consistency_weight = 0.0
            else:
                consistency_weight = get_current_consistency_weight(
                    args, args.final_consistency, epoch, i,
                    len(unlabelled_loader))

            consistency_loss = consistency_weight * consistency_loss
            loss = class_loss + consistency_loss

            opt.zero_grad()
            loss.backward()
            opt.step()

            global_step += 1
            # print(global_step)
            update_ema_variables(model, ema_model, args.ema_decay, global_step)

            count += batch_size
            train_loss += loss.item() * batch_size
        scheduler.step()
        outstr = 'Train %d, loss: %.6f' % (epoch, train_loss * 1.0 / count)
        io.cprint(outstr)

        ####################
        # Evaluation
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        ema_model.eval()
        for data, label in validation_loader:
            data, label = data.to(device), label.to(device)
            batch_size = data.shape[0]
            logits = ema_model(data)
            loss = criterion(logits, label)
            count += batch_size
            test_loss += loss.item() * batch_size
        outstr = 'Test %d, loss: %.6f' % (epoch, test_loss * 1.0 / count)
        io.cprint(outstr)
        if test_loss <= best_test_loss:
            best_test_loss = test_loss
            torch.save(ema_model.state_dict(),
                       'checkpoints/%s/models/model.t7' % args.exp_name)
            torch.save(ema_model, (config.root + config.model_path))
        io.cprint('Time: %.3f sec' % (time.time() - startTime))
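
update_ema_variables is not shown here; a sketch of the mean-teacher style EMA update it is assumed to perform:

def update_ema_variables(model, ema_model, ema_decay, global_step):
    # Ramp the decay up from 0 towards ema_decay early in training, then keep
    # the teacher as an exponential moving average of the student weights.
    alpha = min(1 - 1 / (global_step + 1), ema_decay)
    for ema_param, param in zip(ema_model.parameters(), model.parameters()):
        ema_param.data.mul_(alpha).add_(param.data, alpha=1 - alpha)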
Example #6
class SGHMC(_Inference):
    def __init__(self,
                 hyperparameters,
                 model=None,
                 train_loader=None,
                 model_loss='multi_class_linear_output',
                 device=torch.device('cpu')):
        '''
        :param hyperparameters: Hyperparameters include {'lr', 'prior_std', 'num_samples'}
        :param model: Pytorch model to run SGHMC on.
        :param train_loader: DataLoader for train data
        :param model_loss: Loss function to use for the model. (e.g.: 'multi_class_linear_output')
        :param device: Device on which model is present (e.g.: torch.device('cpu'))
        '''
        if hyperparameters is None:
            # Initialise as some default values
            hyperparameters = {
                'lr': 0.001,
                'prior_std': 10,
                'num_samples': 2,
                'alpha': 0.1,
                'burn_in_epochs': 10
            }

        super(SGHMC, self).__init__(hyperparameters, model, train_loader,
                                    device)
        self.lr = hyperparameters['lr']
        self.prior_std = hyperparameters['prior_std']
        self.num_samples = hyperparameters['num_samples']
        self.alpha = hyperparameters['alpha']
        self.burn_in_epochs = hyperparameters['burn_in_epochs']
        self.model_loss = model_loss
        self.model = model
        self.train_loader = train_loader
        self.device = device
        self.dataset_size = len(train_loader.dataset)
        self.optimizer = optimSGHMC(params=self.model.parameters(),
                                    lr=self.lr,
                                    momentum=1 - self.alpha,
                                    num_training_samples=self.dataset_size,
                                    weight_decay=1 / (self.prior_std**2))
        self.loss_criterion = get_loss_criterion(loss=model_loss)
        self.burnt_in = False
        self.epochs_run = 0
        self.lr_final = self.lr / 2
        # self.optimizer_scheduler = CosineAnnealingLR(optimizer=self.optimizer, T_max=
        # (self.burn_in_epochs + self.num_samples), eta_min=self.lr_final)
        self.optimizer_scheduler = OneCycleLR(
            optimizer=self.optimizer,
            max_lr=self.lr * 5,
            steps_per_epoch=len(self.train_loader),
            epochs=self.burn_in_epochs + self.num_samples)

    def update_hyp(self, hyperparameters):
        self.lr = hyperparameters['lr']
        self.prior_std = hyperparameters['prior_std']
        self.num_samples = hyperparameters['num_samples']
        self.alpha = hyperparameters['alpha']
        self.burn_in_epochs = hyperparameters['burn_in_epochs']
        self.model = reset_model(self.model)
        self.burnt_in = False
        self.epochs_run = 0
        self.optimizer = optimSGHMC(params=self.model.parameters(),
                                    lr=self.lr,
                                    momentum=1 - self.alpha,
                                    num_training_samples=self.dataset_size,
                                    weight_decay=1 / (self.prior_std**2))
        self.lr_final = self.lr / 2
        self.optimizer_scheduler = CosineAnnealingLR(
            optimizer=self.optimizer,
            T_max=self.burn_in_epochs + self.num_samples,
            eta_min=self.lr_final)

    def sample_iterative(self,
                         val_loader=None,
                         debug_val_loss=False,
                         wandb_debug=False):
        if issubclass(self.model.__class__, torch.nn.Module):
            if self.burnt_in is False:
                epochs = self.burn_in_epochs + 1
                self.burnt_in = True
            else:
                epochs = 1
            for epoch in range(epochs):
                self.model.train()
                total_epoch_train_loss = 0.
                for batch_idx, (batch_data,
                                batch_labels) in enumerate(self.train_loader):
                    batch_data = batch_data.to(self.device)
                    batch_labels = batch_labels.to(self.device)
                    batch_data_logits = self.model(batch_data)
                    self.optimizer.zero_grad()
                    loss = self.loss_criterion(batch_data_logits, batch_labels)
                    loss.backward()
                    total_epoch_train_loss += loss.item() * len(batch_data)
                    self.optimizer.step(add_langevin_noise=True)
                    self.optimizer_scheduler.step()
                if debug_val_loss:
                    avg_val_loss = self.compute_val_loss(val_loader)
                    avg_train_loss = total_epoch_train_loss / self.dataset_size
                    metrics = {
                        'train_loss': avg_train_loss,
                        'val_loss': avg_val_loss,
                        'lr': self.optimizer_scheduler.get_lr()
                    }
                    print(metrics)
                    if wandb_debug:
                        wandb.log(metrics)
            output_model = deepcopy(self.model.cpu())
            self.model.to(self.device)
            return output_model
        else:
            raise NotImplementedError

    def sample(self,
               num_samples=None,
               val_loader=None,
               debug_val_loss=False,
               wandb_debug=False):
        output_list = []
        if num_samples is None:
            num_samples = self.num_samples
        if issubclass(self.model.__class__, torch.nn.Module):
            for i in range(num_samples):
                output_list.append(
                    self.sample_iterative(val_loader=val_loader,
                                          debug_val_loss=debug_val_loss,
                                          wandb_debug=wandb_debug))
            return output_list
        else:
            raise NotImplementedError
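
A hedged usage sketch; net and the loaders are placeholders, and the hyperparameter keys mirror the defaults in __init__:

hyp = {'lr': 1e-3, 'prior_std': 10, 'num_samples': 5,
       'alpha': 0.1, 'burn_in_epochs': 10}
sampler = SGHMC(hyp,
                model=net,                       # placeholder torch.nn.Module
                train_loader=train_loader,
                model_loss='multi_class_linear_output',
                device=torch.device('cuda'))
posterior_samples = sampler.sample(val_loader=val_loader, debug_val_loss=True)
# a list of num_samples model copies, each moved to CPU by sample_iterative()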
Example #7
def train(args, io):

    # ============= Model ===================
    num_part = 50
    device = torch.device("cuda" if args.cuda else "cpu")

    MODELClass = importlib.import_module(args.model)
    model = MODELClass.get_model(num_part).to(device)
    io.cprint(str(model))

    model.apply(weight_init)
    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    '''Use Pretrain or not'''
    try:
        state_dict = torch.load("checkpoints/%s/best_insiou_model.pth" %
                                args.exp_name,
                                map_location=torch.device('cpu'))['model']
        first_key = next(iter(state_dict))
        if 'module' not in first_key:
            # checkpoint saved from an unwrapped model: add the 'module.'
            # prefix expected by nn.DataParallel
            from collections import OrderedDict
            state_dict = OrderedDict(
                ('module.' + k, v) for k, v in state_dict.items())
        model.load_state_dict(state_dict)

        print("Using pretrained model...")
        print(
            torch.load("checkpoints/%s/best_insiou_model.pth" %
                       args.exp_name).keys())
    except Exception:
        print("Training from scratch...")

    # =========== Dataloader =================
    train_data = PartNormalDataset(npoints=2048,
                                   split='trainval',
                                   normalize=False)
    print("The number of training data is:%d", len(train_data))

    test_data = PartNormalDataset(npoints=2048, split='test', normalize=False)
    print("The number of test data is:%d", len(test_data))

    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              drop_last=True)

    test_loader = DataLoader(test_data,
                             batch_size=args.test_batch_size,
                             shuffle=False,
                             num_workers=8,
                             drop_last=False)

    # ============= Optimizer ================
    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(),
                         lr=args.lr,
                         betas=(0.9, 0.999),
                         eps=1e-08,
                         weight_decay=args.weight_decay)

    if args.scheduler == 'cos':
        print("Use CosLR")
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr / 100)
    else:
        print("Use StepLR")
        scheduler = StepLR(opt, step_size=args.step, gamma=0.5)

    # ============= Training =================
    best_acc = 0
    best_class_iou = 0
    best_instance_iou = 0
    num_part = 50
    num_classes = 16

    for epoch in range(args.epochs):

        train_epoch(train_loader, model, opt, scheduler, epoch, num_part,
                    num_classes, io)

        test_metrics, total_per_cat_iou = test_epoch(test_loader, model, epoch,
                                                     num_part, num_classes, io)

        # 1. when get the best accuracy, save the model:
        if test_metrics['accuracy'] > best_acc:
            best_acc = test_metrics['accuracy']
            io.cprint('Max Acc:%.5f' % best_acc)
            state = {
                'model':
                model.module.state_dict()
                if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer':
                opt.state_dict(),
                'epoch':
                epoch,
                'test_acc':
                best_acc
            }
            torch.save(state,
                       'checkpoints/%s/best_acc_model.pth' % args.exp_name)

        # 2. when get the best instance_iou, save the model:
        if test_metrics['shape_avg_iou'] > best_instance_iou:
            best_instance_iou = test_metrics['shape_avg_iou']
            io.cprint('Max instance iou:%.5f' % best_instance_iou)
            state = {
                'model':
                model.module.state_dict()
                if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer':
                opt.state_dict(),
                'epoch':
                epoch,
                'test_instance_iou':
                best_instance_iou
            }
            torch.save(state,
                       'checkpoints/%s/best_insiou_model.pth' % args.exp_name)

        # 3. when get the best class_iou, save the model:
        # first we need to calculate the average per-class iou
        class_iou = 0
        for cat_idx in range(16):
            class_iou += total_per_cat_iou[cat_idx]
        avg_class_iou = class_iou / 16
        if avg_class_iou > best_class_iou:
            best_class_iou = avg_class_iou
            # print the iou of each class:
            for cat_idx in range(16):
                io.cprint(classes_str[cat_idx] + ' iou: ' +
                          str(total_per_cat_iou[cat_idx]))
            io.cprint('Max class iou:%.5f' % best_class_iou)
            state = {
                'model':
                model.module.state_dict()
                if torch.cuda.device_count() > 1 else model.state_dict(),
                'optimizer':
                opt.state_dict(),
                'epoch':
                epoch,
                'test_class_iou':
                best_class_iou
            }
            torch.save(state,
                       'checkpoints/%s/best_clsiou_model.pth' % args.exp_name)

    # report best acc, ins_iou, cls_iou
    io.cprint('Final Max Acc:%.5f' % best_acc)
    io.cprint('Final Max instance iou:%.5f' % best_instance_iou)
    io.cprint('Final Max class iou:%.5f' % best_class_iou)
    # save last model
    state = {
        'model':
        model.module.state_dict()
        if torch.cuda.device_count() > 1 else model.state_dict(),
        'optimizer':
        opt.state_dict(),
        'epoch':
        args.epochs - 1,
        'test_iou':
        best_instance_iou
    }
    torch.save(state,
               'checkpoints/%s/model_ep%d.pth' % (args.exp_name, args.epochs))
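
A sketch of reloading the best instance-IoU checkpoint saved above for evaluation; MODELClass, args, and num_part are assumed to be set up as in the training code:

ckpt = torch.load('checkpoints/%s/best_insiou_model.pth' % args.exp_name,
                  map_location='cpu')
state = ckpt['model']
# the state dict is saved without the 'module.' prefix only on multi-GPU runs,
# so strip the prefix defensively before loading into an unwrapped model
state = {k.replace('module.', '', 1): v for k, v in state.items()}
eval_model = MODELClass.get_model(num_part)
eval_model.load_state_dict(state)
eval_model.eval()
print('epoch %d, instance iou %.5f' % (ckpt['epoch'], ckpt['test_instance_iou']))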
Example #8
def train_fold(args, device, save_dir, log, tbx, cross_val = False, fold_idx = None):
    """
    Perform training and evaluate for the current fold
    """         
    # Define loss function
    class_weights = torch.FloatTensor(CLASS_W)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights).to(device) # CrossEntropyLoss includes softmax

    # Get model
    log.info('Building model...')
    if args.model_name == 'SeizureNet':
        model = SeizureNet(args)
    else:
        raise ValueError('Unsupported model_name: {}'.format(args.model_name))
        
    model = nn.DataParallel(model, args.gpu_ids)
    step = 0
    model = model.to(device)
    
    # To train mode
    model.train()

    # Get saver
    saver = utils.CheckpointSaver(save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adam(params=model.parameters(), 
                           lr=args.lr_init, weight_decay=args.l2_wd)
    scheduler = CosineAnnealingLR(optimizer, T_max=args.num_epochs)
    
    # Get data loader    
    log.info('Building dataset...')
    if cross_val:
        seizure_file = os.path.join('data', 'fold' + str(fold_idx) + '_trainSet_seizure_files.txt')
    else:
        seizure_file = TRAIN_SEIZURE_FILE
        
    train_dataset = SeizureDataset(seizure_file, num_folds=args.num_folds, fold_idx=fold_idx, cross_val=cross_val, split='train')
    train_loader = data.DataLoader(dataset=train_dataset,
                                    shuffle=True,
                                    batch_size=args.train_batch_size,
                                    num_workers=args.num_workers)
    
    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    best_path = None  # will hold the path of the best checkpoint saved below
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
            tqdm(total=len(train_loader.dataset)) as progress_bar:
            for features, y, _, in train_loader: 
                batch_size = features.shape[0]     
                
                # Setup for forward
                features = features.view(-1, 3, 224, 224) # merge number of dense samples with batch size
                features = features.to(device)
                y = y.view(-1) # merge number of dense samples with batch size
                y = y.to(device)
                
                # Zero out optimizer first
                optimizer.zero_grad()
                
                # Forward
                logits = model(features)
                loss = loss_fn(logits, y)
                loss_val = loss.item()
                
                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()                

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         loss=loss_val,
                                         lr=optimizer.param_groups[0]['lr'])
                if cross_val:
                    tbx.add_scalar('fold{}/train/Loss'.format(fold_idx), loss_val, step)
                    tbx.add_scalar('fold{}/train/LR'.format(fold_idx),
                                   optimizer.param_groups[0]['lr'],
                                   step)
                else:
                    tbx.add_scalar('train/Loss', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps
                    
                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    eval_results = evaluate_fold(model,                                            
                                                args,
                                                save_dir,
                                                device,
                                                cross_val=cross_val,
                                                fold_idx=fold_idx,
                                                is_test=False,
                                                write_outputs=False)
                    best_path = saver.save(step, model, eval_results[args.metric_name], device, eval_results)
                    
                    # Back to train mode
                    model.train()

                    # Log to console
                    results_str = ', '.join('{}: {}'.format(k, v)
                                            for k, v in eval_results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in eval_results.items():
                        if cross_val:
                            tbx.add_scalar('fold{}/eval/{}'.format(fold_idx,k), v, step)
                        else:
                            tbx.add_scalar('eval/{}'.format(k), v, step)
        
        # step lr scheduler
        scheduler.step()
        
    return best_path
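
A hedged sketch of a cross-validation driver around train_fold; the writer construction and fold count are assumptions:

from torch.utils.tensorboard import SummaryWriter  # assumed writer backend

tbx = SummaryWriter(log_dir=save_dir)
best_paths = []
for fold_idx in range(args.num_folds):
    log.info('===== Fold {} ====='.format(fold_idx))
    best_paths.append(
        train_fold(args, device, save_dir, log, tbx,
                   cross_val=True, fold_idx=fold_idx))
tbx.close()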
Example #9
class HotDogTrainer(object):
    def __init__(self):
        super(HotDogTrainer, self).__init__()
        self.model = NaiveDLClassifier()
        self.epoch = epoch
        self.data = HotDogDataSetLoader()
        self.gpu_ids = GPUS_LIST
        self.load_model_path = model_path
        self.stat_cache = None
        self.global_step = 0
        self.writer = SummaryWriter()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.0001)
        self.scheduler = CosineAnnealingLR(self.optimizer,
                                           len(self.data.train()))
        self.device = torch.device("cuda:0" if GPUS_LIST else "cpu")
        self.loss = torch.nn.CrossEntropyLoss()

    def initialize(self):
        if GPUS_LIST:
            self.model.to(self.device)
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.gpu_ids)
        if self.load_model_path:
            if os.path.exists(self.load_model_path):
                self.load_old_best()

    def savemodel(self, metrics):
        import json
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f)
        if GPUS_LIST:
            torch.save(self.model.module.state_dict(), self.load_model_path)
        else:
            torch.save(self.model.state_dict(), self.load_model_path)

    def train(self, nb_epoch):
        trainstream = tqdm(self.data.train())
        self.avg_loss = AverageMeter()
        self.avg_acc = AverageMeter()
        self.model.train()

        for i, data in enumerate(trainstream):
            self.global_step += 1
            trainstream.set_description("TRAINING")

            x = data['image'].to(self.device)
            y = data['label'].to(self.device)

            with torch.set_grad_enabled(True):
                y_ = self.model(x)
                out_labels = torch.max(y_, 1)[1]
                loss = self.loss(y_, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                acc = 100. * (
                    (y.int().flatten() == out_labels.int().flatten()).sum() /
                    y.size(0))
                self.avg_acc.update(acc.item())
                self.avg_loss.update(loss.item())

                self.writer.add_scalar('Loss/Train', self.avg_loss.avg,
                                       self.global_step)
                self.writer.add_scalar('Accuracy/Train', self.avg_acc.avg,
                                       self.global_step)

                trainstream.set_postfix({
                    'epoch': nb_epoch,
                    'loss': self.avg_loss.avg,
                    'accuracy': self.avg_acc.avg
                })
        self.scheduler.step()
        trainstream.close()
        self.test(nb_epoch)

    def test(self, nb_epoch):
        self.model.eval()
        teststream = tqdm(self.data.test())

        self.avg_loss = AverageMeter()
        self.avg_acc = AverageMeter()

        teststream.set_description('TESTING')
        with torch.no_grad():
            for i, data in enumerate(teststream):
                x = data['image'].to(self.device)
                y = data['label'].to(self.device)
                y_ = self.model(x)
                loss = self.loss(y_, y)
                out_labels = torch.max(y_, 1)[1]
                acc = 100. * (
                    (y.int().flatten() == out_labels.int().flatten()).sum() /
                    y.size(0))
                self.avg_acc.update(acc.item())

                self.avg_loss.update(loss.item())

                teststream.set_postfix({
                    'epoch': nb_epoch,
                    'loss': self.avg_loss.avg,
                    'accuracy': self.avg_acc.avg
                })

        self.writer.add_scalar('Loss/Test', self.avg_loss.avg, nb_epoch)
        self.writer.add_scalar('Accuracy/Test', self.avg_acc.avg, nb_epoch)

        if not self.stat_cache:
            self.stat_cache = {'best': self.avg_acc.avg}
            print('SAVING MODEL')
            self.savemodel({'best': self.avg_acc.avg})
        else:
            if self.stat_cache['best'] < self.avg_acc.avg:
                print('LOADING BEST MODEL')
                self.load_old_best()

    def load_old_best(self):
        import json
        with open(metrics_path, 'r') as f:
            self.stat_cache = json.load(f)

        if GPUS_LIST:
            self.model.module.load_state_dict(torch.load(self.load_model_path))
        else:
            self.model.load_state_dict(torch.load(self.load_model_path))

    def run(self):
        self.initialize()
        for i in range(self.epoch):
            self.train(i)
        self.writer.close()
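
The trainer reads several module-level globals; a hedged sketch of defining them and running it (all values are placeholders):

import torch

GPUS_LIST = [0] if torch.cuda.is_available() else []   # device ids for DataParallel
epoch = 10                        # number of epochs run() loops over
model_path = 'hotdog_best.pth'    # weights file used by savemodel()/load_old_best()
metrics_path = 'hotdog_metrics.json'

trainer = HotDogTrainer()
trainer.run()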
Example #10
    def setup_and_start_training(self):
        logging.basicConfig(
            stream=sys.stdout,
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and self.
                              system_dict["params"]["use_cuda"] else "cpu")

        if self.system_dict["params"]["use_cuda"] and torch.cuda.is_available(
        ):
            torch.backends.cudnn.benchmark = True
            logging.info("Using gpu.")
        else:
            logging.info("Using cpu.")

        timer = Timer()
        logging.info(self.system_dict)

        if self.system_dict["params"]["net"] == 'vgg16-ssd':
            create_net = create_vgg_ssd
            config = vgg_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd':
            create_net = create_mobilenetv1_ssd
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd-lite':
            create_net = create_mobilenetv1_ssd_lite
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'sq-ssd-lite':
            create_net = create_squeezenet_ssd_lite
            config = squeezenet_ssd_config
        elif self.system_dict["params"]["net"] == 'mb2-ssd-lite':
            create_net = lambda num: create_mobilenetv2_ssd_lite(
                num, width_mult=self.system_dict["params"]["mb2_width_mult"])
            config = mobilenetv1_ssd_config
        else:
            logging.fatal("The net type is wrong.")
            sys.exit(1)

        train_transform = TrainAugmentation(config.image_size,
                                            config.image_mean,
                                            config.image_std)
        target_transform = MatchPrior(config.priors, config.center_variance,
                                      config.size_variance, 0.5)

        test_transform = TestTransform(config.image_size, config.image_mean,
                                       config.image_std)

        logging.info("Prepare training datasets.")
        datasets = []
        dataset = VOCDataset(
            self.system_dict["dataset"]["val"]["img_dir"],
            self.system_dict["dataset"]["val"]["label_dir"],
            transform=train_transform,
            target_transform=target_transform,
            label_file=self.system_dict["params"]["label_file"])
        label_file = self.system_dict["params"]["label_file"]
        #store_labels(label_file, dataset.class_names)
        num_classes = len(dataset.class_names)
        datasets.append(dataset)
        logging.info(f"Stored labels into file {label_file}.")
        train_dataset = ConcatDataset(datasets)
        logging.info("Train dataset size: {}".format(len(train_dataset)))
        train_loader = DataLoader(
            train_dataset,
            self.system_dict["params"]["batch_size"],
            num_workers=self.system_dict["params"]["num_workers"],
            shuffle=True)

        if (self.system_dict["dataset"]["val"]["status"]):
            val_dataset = VOCDataset(
                self.system_dict["dataset"]["val"]["img_dir"],
                self.system_dict["dataset"]["val"]["label_dir"],
                transform=test_transform,
                target_transform=target_transform,
                is_test=True,
                label_file=self.system_dict["params"]["label_file"])
            logging.info("validation dataset size: {}".format(
                len(val_dataset)))
            val_loader = DataLoader(
                val_dataset,
                self.system_dict["params"]["batch_size"],
                num_workers=self.system_dict["params"]["num_workers"],
                shuffle=False)

        logging.info("Build network.")
        net = create_net(num_classes)
        min_loss = -10000.0
        last_epoch = -1

        base_net_lr = self.system_dict["params"][
            "base_net_lr"] if self.system_dict["params"][
                "base_net_lr"] is not None else self.system_dict["params"]["lr"]
        extra_layers_lr = self.system_dict["params"][
            "extra_layers_lr"] if self.system_dict["params"][
                "extra_layers_lr"] is not None else self.system_dict["params"][
                    "lr"]

        if self.system_dict["params"]["freeze_base_net"]:
            logging.info("Freeze base net.")
            freeze_net_layers(net.base_net)
            params = itertools.chain(net.source_layer_add_ons.parameters(),
                                     net.extras.parameters(),
                                     net.regression_headers.parameters(),
                                     net.classification_headers.parameters())
            params = [{
                'params':
                itertools.chain(net.source_layer_add_ons.parameters(),
                                net.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(net.regression_headers.parameters(),
                                net.classification_headers.parameters())
            }]
        elif self.system_dict["params"]["freeze_net"]:
            freeze_net_layers(net.base_net)
            freeze_net_layers(net.source_layer_add_ons)
            freeze_net_layers(net.extras)
            params = itertools.chain(net.regression_headers.parameters(),
                                     net.classification_headers.parameters())
            logging.info("Freeze all the layers except prediction heads.")
        else:
            params = [{
                'params': net.base_net.parameters(),
                'lr': base_net_lr
            }, {
                'params':
                itertools.chain(net.source_layer_add_ons.parameters(),
                                net.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(net.regression_headers.parameters(),
                                net.classification_headers.parameters())
            }]

        timer.start("Load Model")
        resume = self.system_dict["params"]["resume"]
        base_net = self.system_dict["params"]["base_net"]
        pretrained_ssd = self.system_dict["params"]["pretrained_ssd"]
        if self.system_dict["params"]["resume"]:
            logging.info(f"Resume from the model {resume}")
            net.load(self.system_dict["params"]["resume"])
        elif self.system_dict["params"]["base_net"]:
            logging.info(f"Init from base net {base_net}")
            net.init_from_base_net(self.system_dict["params"]["base_net"])
        elif self.system_dict["params"]["pretrained_ssd"]:
            logging.info(f"Init from pretrained ssd {pretrained_ssd}")
            net.init_from_pretrained_ssd(
                self.system_dict["params"]["pretrained_ssd"])
        logging.info(
            f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

        net.to(DEVICE)

        criterion = MultiboxLoss(config.priors,
                                 iou_threshold=0.5,
                                 neg_pos_ratio=3,
                                 center_variance=0.1,
                                 size_variance=0.2,
                                 device=DEVICE)
        optimizer = torch.optim.SGD(
            params,
            lr=self.system_dict["params"]["lr"],
            momentum=self.system_dict["params"]["momentum"],
            weight_decay=self.system_dict["params"]["weight_decay"])
        lr = self.system_dict["params"]["lr"]
        logging.info(
            f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, " +
            f"Extra Layers learning rate: {extra_layers_lr}.")

        if (not self.system_dict["params"]["milestones"]):
            self.system_dict["params"]["milestones"] = ""
            self.system_dict["params"]["milestones"] += str(
                int(self.system_dict["params"]["num_epochs"] / 3)) + ","
            self.system_dict["params"]["milestones"] += str(
                int(2 * self.system_dict["params"]["num_epochs"] / 3))

        if self.system_dict["params"]["scheduler"] == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [
                int(v.strip())
                for v in self.system_dict["params"]["milestones"].split(",")
            ]
            scheduler = MultiStepLR(optimizer,
                                    milestones=milestones,
                                    gamma=0.1,
                                    last_epoch=last_epoch)
        elif self.system_dict["params"]["scheduler"] == 'cosine':
            logging.info("Uses CosineAnnealingLR scheduler.")
            scheduler = CosineAnnealingLR(optimizer,
                                          self.system_dict["params"]["t_max"],
                                          last_epoch=last_epoch)

        logging.info(f"Start training from epoch {last_epoch + 1}.")
        for epoch in range(last_epoch + 1,
                           self.system_dict["params"]["num_epochs"]):
            scheduler.step()
            self.base_train(
                train_loader,
                net,
                criterion,
                optimizer,
                device=DEVICE,
                debug_steps=self.system_dict["params"]["debug_steps"],
                epoch=epoch)

            if ((self.system_dict["dataset"]["val"]["status"]) and
                (epoch % self.system_dict["params"]["validation_epochs"] == 0
                 or epoch == self.system_dict["params"]["num_epochs"] - 1)):
                val_loss, val_regression_loss, val_classification_loss = self.base_test(
                    val_loader, net, criterion, DEVICE)
                logging.info(
                    f"Epoch: {epoch}, " +
                    f"Validation Loss: {val_loss:.4f}, " +
                    f"Validation Regression Loss {val_regression_loss:.4f}, " +
                    f"Validation Classification Loss: {val_classification_loss:.4f}"
                )
                net_name = self.system_dict["params"]["net"]
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
            if (not self.system_dict["dataset"]["val"]["status"]):
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
Example #11
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    cel = nn.CrossEntropyLoss()
    criterion = lambda pred, target, lam: (
        -F.log_softmax(pred, dim=1) * torch.zeros(pred.size()).cuda().scatter_(
            1, target.data.view(-1, 1), lam.view(-1, 1))
    ).sum(dim=1).mean()
    parameters_bias = [p[1] for p in model.named_parameters() if 'bias' in p[0]]
    parameters_scale = [p[1] for p in model.named_parameters() if 'scale' in p[0]]
    parameters_others = [p[1] for p in model.named_parameters() if not ('bias' in p[0] or 'scale' in p[0])]
    optimizer = torch.optim.SGD(
        [{'params': parameters_bias, 'lr': args.base_lr/10.},
        {'params': parameters_scale, 'lr': args.base_lr/10.},
        {'params': parameters_others}],
        lr=args.base_lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    sgdr = CosineAnnealingLR(optimizer, args.epochs, eta_min=0, last_epoch=-1)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, cel, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        
        # save checkpoint for every epoch
        save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best, '{0}-checkpoint-{1}.pth.tar'.format(args.arch,epoch + 1))

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer' : optimizer.state_dict(),
            }, is_best)
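
main_worker follows the PyTorch ImageNet example layout; a sketch of the usual launcher that spawns one process per GPU (argument plumbing assumed):

import torch.multiprocessing as mp

ngpus_per_node = torch.cuda.device_count()
if args.multiprocessing_distributed:
    # world_size becomes the total number of processes across all nodes
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
    main_worker(args.gpu, ngpus_per_node, args)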
Example #12
        xargs.data_path + f'/{xargs.dataset}-split.txt', xargs.batch_size,
        xargs.workers)
    search_model = NASNetworkGDAS(xargs.channel, xargs.num_cells, xargs.steps,
                                  xargs.multiplier, xargs.stem_multiplier,
                                  xargs.num_classes, xargs.space, xargs.affine,
                                  xargs.track_running_stats,
                                  xargs.fix_reduction, xargs.paper_arch,
                                  xargs.no_gumbel)
    criterion = torch.nn.CrossEntropyLoss()
    w_optimizer = torch.optim.SGD(search_model.get_weights(),
                                  xargs.LR,
                                  momentum=xargs.momentum,
                                  weight_decay=xargs.decay,
                                  nesterov=xargs.nesterov)
    w_scheduler = CosineAnnealingLR(w_optimizer,
                                    T_max=xargs.epochs,
                                    eta_min=xargs.eta_min)
    a_optimizer = torch.optim.Adam(search_model.get_alphas(),
                                   lr=xargs.arch_learning_rate,
                                   betas=(0.5, 0.999),
                                   weight_decay=xargs.arch_weight_decay)
    network, criterion = search_model.cuda(), criterion.cuda()
    # save directories
    os.mkdir(xargs.save_dir + '/checkpoint/')
    model_base_path = xargs.save_dir + f'/checkpoint/seed-{xargs.rand_seed}-basic.pth'
    model_best_path = xargs.save_dir + f'/checkpoint/seed-{xargs.rand_seed}-best.pth'

    best_val_acc = -1
    genotypes = {-1: search_model.genotype()}
    if xargs.mixed_prec:
        network, [w_optimizer,
def train(args, io):
    train_loader = DataLoader(ModelNet40(partition='train', num_points=args.num_points), num_workers=8,
                              batch_size=args.batch_size, shuffle=True, drop_last=True,pin_memory=False)
    test_loader = DataLoader(ModelNet40(partition='test', num_points=args.num_points), num_workers=8,
                             batch_size=args.test_batch_size, shuffle=True, drop_last=False,pin_memory=False)

    device =torch.device("cuda:0" if use_cuda else "cpu")    #torch.device("cuda" if args.cuda else "cpu")

    #Try to load models
    if args.model == 'pointnet':
        model = PointNet(args).to(device)
    elif args.model == 'dgcnn':
        model = DGCNN(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr*100, momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)
    
    criterion = cal_loss

    best_test_acc = 0
    for epoch in range(args.epochs):
        scheduler.step()
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_pred = []
        train_true = []
        for data, label in train_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            opt.zero_grad()
            logits = model(data)
            loss = criterion(logits, label)
            loss.backward()
            opt.step()
            preds = logits.max(dim=1)[1]
            count += batch_size
            train_loss += loss.item() * batch_size
            train_true.append(label.cpu().numpy())
            train_pred.append(preds.detach().cpu().numpy())
        train_true = np.concatenate(train_true)
        train_pred = np.concatenate(train_pred)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f' % (epoch,
                                                                                 train_loss*1.0/count,
                                                                                 metrics.accuracy_score(
                                                                                     train_true, train_pred),
                                                                                 metrics.balanced_accuracy_score(
                                                                                     train_true, train_pred))
        io.cprint(outstr)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_pred = []
        test_true = []
        for data, label in test_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            logits = model(data)
            loss = criterion(logits, label)
            preds = logits.max(dim=1)[1]
            count += batch_size
            test_loss += loss.item() * batch_size
            test_true.append(label.cpu().numpy())
            test_pred.append(preds.detach().cpu().numpy())
        test_true = np.concatenate(test_true)
        test_pred = np.concatenate(test_pred)
        test_acc = metrics.accuracy_score(test_true, test_pred)
        avg_per_class_acc = metrics.balanced_accuracy_score(test_true, test_pred)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f' % (epoch,
                                                                              test_loss*1.0/count,
                                                                              test_acc,
                                                                              avg_per_class_acc)
        io.cprint(outstr)
        if test_acc >= best_test_acc:
            best_test_acc = test_acc
            torch.save(model.state_dict(), 'checkpoints/%s/models/model.t7' % args.exp_name)
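# The snippet above calls scheduler.step() at the top of each epoch, before any
# optimizer.step(). Since PyTorch 1.1 the documented order is the reverse; below is
# a minimal, self-contained sketch of that per-epoch pattern (names and sizes are
# illustrative and not taken from the example above):
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR

model = nn.Linear(8, 2)
opt = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = CosineAnnealingLR(opt, T_max=100, eta_min=1e-4)

for epoch in range(100):
    for _ in range(10):  # stands in for the real batch loop
        opt.zero_grad()
        loss = model(torch.randn(4, 8)).sum()
        loss.backward()
        opt.step()        # update the weights first
    scheduler.step()      # then advance the cosine schedule once per epoch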
# optimizer = Adas(model.parameters())
lr_start = 1e-4
lr_end = 1e-6
weight_decay = 0
epoch_num = 20
wandb.config.lr_start = lr_start
wandb.config.lr_end = lr_end
wandb.config.weight_decay = weight_decay
wandb.config.epoch_num = epoch_num
wandb.config.optimizer = 'adam'
wandb.config.scheduler = 'CosineAnnealingLR'

# optimizer = Adam(group_weight(model, weight_decay=weight_decay), lr=lr_start, weight_decay=0)
optimizer = Adam(model.parameters(), lr=lr_start, weight_decay=0)
scheduler = CosineAnnealingLR(optimizer,
                              T_max=epoch_num,
                              eta_min=lr_end,
                              last_epoch=-1)
model = model.to(device)
max_val_auc = 0

for epoch in range(epoch_num):
    train_loss, train_avg_auc, train_auc, train_rocs, train_data_pr, train_duration = one_epoch_train(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
        scaler,
        iters_to_accumulate=accumulation_step,
        clip_grads=False)
    val_loss, val_avg_auc, val_auc, val_rocs, val_data_pr, val_duration = eval_model(
def main():
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                #ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            #OneOf([
            #    ShiftScaleRotate(p=0.5),
            ##    RandomRotate90(p=0.5),
            #    Rotate(p=0.5)
            #], p=0.5),
            OneOf([
                Blur(blur_limit=8, p=0.5),
                MotionBlur(blur_limit=8, p=0.5),
                MedianBlur(blur_limit=8, p=0.5),
                GaussianBlur(blur_limit=8, p=0.5)
            ], p=0.5),
            OneOf([
                #CLAHE(clip_limit=4, tile_grid_size=(4, 4), p=0.5),
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5)
        ])
        # NOTE: this second assignment overrides the richer pipeline defined above,
        # so only the Flip augmentation is actually used
        train_augmentation = Compose([
            Flip(p=0.5)
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                    transforms=train_augmentation)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                  transforms=val_augmentation)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Linknet('se_resnext101_32x4d', encoder_weights='imagenet', classes=N_CLASSES, encoder_se_module=True,
                         decoder_semodule=True, h_columns=False)
        model.load_state_dict(torch.load(model_path))
        model.to(device)

        #criterion = torch.nn.BCEWithLogitsLoss()
        criterion = FocalLovaszLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
        #scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=CLR_CYCLE*2, after_scheduler=scheduler_cosine)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    with timer('train'):
        train_losses = []
        valid_losses = []

        best_model_loss = 999
        best_model_ep = 0
        checkpoint = 0

        for epoch in range(1, EPOCHS + 1):
            if epoch % (CLR_CYCLE * 2) == 0:
                if epoch != 0:
                    y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                    best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
                    for i in range(N_CLASSES):
                        th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :])
                        LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                            round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))
                checkpoint += 1
                best_model_loss = 999

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss, val_pred, y_val = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.state_dict(), '{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                best_pred = val_pred

            del val_pred
            gc.collect()

    with timer('eval'):
        y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1])
        for i in range(N_CLASSES):
            th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :])
            LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format(
                round(best_model_loss, 5), round(score, 5), best_model_ep, th, i))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
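# The example above runs CosineAnnealingLR with T_max=CLR_CYCLE, stepped once per
# epoch, and treats every CLR_CYCLE * 2 epochs as one cycle. For explicit cosine
# cycles that restart at the base learning rate, torch also provides
# CosineAnnealingWarmRestarts; a minimal sketch with illustrative epoch counts:
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

model = nn.Linear(4, 1)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
# restart the cosine curve every 10 epochs, annealing down to 3e-5 within each cycle
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=3e-5)

for epoch in range(30):
    # ... one epoch of training ...
    scheduler.step()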
Example #16
0
def main():
    global best_prec1, args

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        import apex
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # Scale learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.
    print('learning rate: ', args.lr)
    param = model.parameters()
    optimizer = torch.optim.SGD(param,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.lr_adjust_type == 'step':
        scheduler = MultiStepLR(optimizer,
                                milestones=args.lr_adjust_step,
                                gamma=0.1)
    elif args.lr_adjust_type == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, args.epochs)
    elif args.lr_adjust_type == 'exp':
        scheduler = ExponentialLR(optimizer, args.gamma)

    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.  If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            # make the restored best_prec1 visible outside this local scope
            global best_prec1
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if (args.arch == "inception_v3"):
        raise RuntimeError(
            "Currently, inception_v3 is not supported by this example.")
        # crop_size = 299
        # val_size = 320 # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    trans = transforms.Compose([
        transforms.RandomResizedCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        # transforms.ToTensor(), Too slow
        # normalize,
    ])
    train_dataset = datasets.ImageFolder(traindir, trans)
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=fast_collate)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             collate_fn=fast_collate)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return
    st_time = time.time()
    prec1 = 0.
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if args.grid:
            grid.set_prob(epoch, args.st_epochs)

        # train for one epoch
        #adjust_learning_rate(scheduler, optimizer, epoch, 1, 1)
        train(train_loader, model, criterion, optimizer, epoch, scheduler)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            print(epoch)
            print('Learning rate:', optimizer.param_groups[0]['lr'])
            print('Total Time: ' + format_time(time.time() - st_time))
            print('Remaining Time: ' +
                  format_time((time.time() - st_time) /
                              (epoch - args.start_epoch + 1) *
                              (args.epochs - epoch - 1)))
            print('Best Acc: ' + str(best_prec1))
            save_checkpoint(
                args, {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
def main():
    args = parse_arg()
    set_seed(1217)
    print(args.model)
    print('%s fold-%d...' % (args.mode, args.fold))
    args.run_root = args.run_root + '/' + args.model  #'0822_efficientnetb0_LB804'
    run_root = Path(args.run_root)

    if run_root.exists() and args.clean:
        shutil.rmtree(run_root)
    run_root.mkdir(exist_ok=True, parents=True)

    train_root = DATA_ROOT / ('images_%d' % SIZE[0])
    valid_root = train_root
    test_root = train_root

    sample_sub = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
    ss = pd.DataFrame()
    ss['Image_Label'] = sample_sub['Image_Label'].apply(
        lambda x: x.split('_')[0]).unique()
    ss['EncodedPixels'] = '1 1'

    fold_df = pd.read_csv('./files/5-folds_%d.csv' % (SIZE[0]))
    train_fold = fold_df[fold_df['fold'] != args.fold].reset_index(drop=True)
    valid_fold = fold_df[fold_df['fold'] == args.fold].reset_index(drop=True)

    if args.pos_only and args.cls_label in [0, 1, 2, 3]:
        train_fold['flag'] = train_fold['Labels'].apply(
            lambda x: 1 if str(args.cls_label) in x else 0)
        valid_fold['flag'] = valid_fold['Labels'].apply(
            lambda x: 1 if str(args.cls_label) in x else 0)
        train_fold = train_fold[train_fold['flag'] == 1].reset_index(drop=True)
        valid_fold = valid_fold[valid_fold['flag'] == 1].reset_index(drop=True)
    PIXEL_THRESHOLDS = args.pixel
    AREA_SIZES = args.area

    if args.pl == 1:  # add pseudo labels
        df_pl = pd.read_csv('./files/df_pl.csv')
        for col in train_fold.columns:
            if col not in df_pl.columns:
                df_pl[col] = 0
        train_fold = train_fold.append(df_pl)
        train_fold.fillna('', inplace=True)

    if args.limit:
        train_fold = train_fold[:args.limit]
        valid_fold = valid_fold[:args.limit]

    if args.sliding:
        train_transform = transform_train_al((256, 256))
    else:
        train_transform = transform_train_al(SIZE)
    test_transform = transform_test_al(SIZE)

    # model_name = args.model if '-' not in args.model else args.model.split('-')[0]
    # if model_name.startswith('effi'):
    #     model_name = model_name[:-2] + '-' + model_name[-2:]
    # #model = model_steel(model_name, pretrained=True, down=False)
    # if model_name == 'resnext101_32x16d':
    #     encoder_weights = 'instagram'
    # else:
    #     encoder_weights = 'imagenet'
    # if args.framework == 'Unet':
    #     model = smp.Unet(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None)
    # elif args.framework == 'FPN':
    #     model = smp.FPN(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None)
    # elif args.framework == 'JPU':
    #     model = model_cloud_JPU(model_name, classes=args.n_classes, encoder_weights=encoder_weights, activation=None)
    # elif '_' in args.framework:
    #     framework = args.framework.split('_')[0]
    #     model = model_cloud_smp(framework, model_name, classes=args.n_classes, pretrained=True)
    # else:
    #     raise RuntimeError('Framework %s not implemented.' % (args.framework))
    model = get_model(args)

    if args.mode == 'train':
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        training_set = Dataset_cloud(train_root,
                                     df=train_fold,
                                     transform=train_transform,
                                     mode='train',
                                     cls_label=args.cls_label)
        #sampler = EmptySampler(data_source=training_set, positive_ratio_range=sampler_ratio, epochs=args.n_epochs)
        validation_set = Dataset_cloud(train_root,
                                       df=valid_fold,
                                       transform=test_transform,
                                       mode='train',
                                       cls_label=args.cls_label)

        print(f'{len(training_set):,} items in train, ',
              f'{len(validation_set):,} in valid')

        train_loader = DataLoader(
            training_set,
            batch_size=args.batch_size,
            num_workers=args.workers,
            sampler=None,
            drop_last=False,
            shuffle=True,
        )
        valid_loader = DataLoader(
            validation_set,
            shuffle=False,
            batch_size=args.batch_size,
            #collate_fn=null_collate,
            num_workers=args.workers)

        model = model.cuda()

        #optimizer = Adam([{'params': model.encoder.parameters(), 'lr': args.lr},
        #                  {'params': model.decoder.parameters(), 'lr': args.lr*10}])
        optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=args.lr,
                         weight_decay=0,
                         betas=(0.9, 0.999),
                         eps=1e-08)
        if args.lrc == 'reduceLR':
            scheduler = ReduceLROnPlateau(optimizer,
                                          patience=args.patience,
                                          factor=args.gamma,
                                          verbose=True,
                                          mode='max')
        elif args.lrc == 'cos':
            scheduler = CosineAnnealingLR(optimizer,
                                          args.patience,
                                          eta_min=args.lr * args.gamma)
        elif args.lrc == 'warmRestart':
            scheduler = WarmRestart(optimizer,
                                    T_max=args.patience,
                                    T_mult=1,
                                    eta_min=1e-6)
        elif args.lrc == '':
            scheduler = None


#        scheduler = StepLR(optimizer, step_size=args.patience, gamma=args.gamma)
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        #
        train_kwargs = dict(
            args=args,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            train_loader=train_loader,
            valid_loader=valid_loader,
            # use_cuda=use_cuda,
            epoch_length=len(training_set),
        )

        train(n_epochs=args.n_epochs, **train_kwargs)

        file = '%s/train-%d.log' % (args.run_root, args.fold)
        df = pd.read_csv(file, sep='|')
        cols = df.columns
        df.columns = [x.strip() for x in cols]
        fig, ax = plt.subplots(2, 2, figsize=(12, 12))
        #loss profile
        ax[0, 0].plot(df.epoch, df.loss, label='train-loss', marker='o')
        ax[0, 0].plot(df.epoch, df['val loss'], label='val-loss', marker='x')
        ax[0, 0].set_xlabel('epoch')
        ax[0, 0].set_ylabel('loss')
        ax[0, 0].legend()
        #lr profile
        ax[0, 1].plot(df.epoch, df.lr, label='lr', marker='o')
        ax[0, 1].set_xlabel('epoch')
        ax[0, 1].set_ylabel('lr')
        ax[0, 1].legend()
        if 'AUC-mean' in df.columns:  #cls
            ax[1, 0].plot(df.epoch, df['AUC-mean'], '-ro', label='AUC-mean')
            ax[1, 0].set_xlabel('epoch')
            ax[1, 0].set_ylabel('AUC-mean')
            ax[1, 0].legend()
            for k in range(4):
                ax[1, 1].plot(df.epoch,
                              df['class%d' % (k + 1)],
                              '-o',
                              label=CLASS_NAMES[k])
            ax[1, 1].set_xlabel('epoch')
            ax[1, 1].set_ylabel('AUC')
            ax[1, 1].legend()
        else:
            ax[1, 0].plot(df.epoch, df['val dice'], '-ro', label='dice')
            ax[1, 0].set_xlabel('epoch')
            ax[1, 0].set_ylabel('val-dice')
            ax[1, 0].legend()
        fig.savefig(Path(args.run_root) / ('train_%d.png' % (args.fold)))

    elif args.mode.startswith('predict'):
        if (run_root / ('best-dice-%d.pt' % args.fold)).exists():
            load_model(model,
                       run_root / ('best-dice-%d.pt' % args.fold),
                       multi2single=False)
        else:
            load_model(model,
                       run_root / ('best-model-%d.pt' % args.fold),
                       multi2single=False)
        model = model.cuda()
        if args.mode == 'predict_valid':
            valid_set = Dataset_cloud(valid_root,
                                      df=valid_fold,
                                      transform=test_transform,
                                      mode='test',
                                      cls_label=args.cls_label)
            valid_loader = DataLoader(valid_set,
                                      shuffle=False,
                                      batch_size=args.batch_size,
                                      num_workers=args.workers)
            predict(model,
                    args.mode,
                    loader=valid_loader,
                    out_path=run_root,
                    fold=args.fold,
                    tta=args.tta,
                    args=args)

        elif args.mode == 'predict_test':
            if args.limit:
                ss = ss[:args.limit]
            test_set = Dataset_cloud(test_root,
                                     df=ss,
                                     transform=test_transform,
                                     mode='test',
                                     cls_label=args.cls_label)
            test_loader = DataLoader(test_set,
                                     shuffle=False,
                                     batch_size=args.batch_size,
                                     num_workers=args.workers)
            predict(model,
                    args.mode,
                    loader=test_loader,
                    out_path=run_root,
                    fold=args.fold,
                    tta=args.tta,
                    args=args)
        elif args.mode == 'predict_5fold':
            if args.limit:
                ss = ss[:args.limit]
            test_set = Dataset_cloud(test_root,
                                     df=ss,
                                     transform=test_transform,
                                     mode='test')
            test_loader = DataLoader(test_set,
                                     shuffle=False,
                                     batch_size=args.batch_size,
                                     num_workers=args.workers)
            predict_5fold(test_loader,
                          out_path=run_root,
                          args=args,
                          pixel_thresholds=PIXEL_THRESHOLDS,
                          area_size=AREA_SIZES)
        else:
            raise RuntimeError('%s mode not implemented' % (args.mode))
    elif args.mode == 'opt':
        # create save folder
        if (run_root / ('opt')).exists():
            pass
        else:
            output_root = Path(run_root / ('opt'))
            output_root.mkdir(exist_ok=True, parents=True)
        # Load model
        if (run_root / ('best-dice-%d.pt' % args.fold)).exists():
            load_model(model,
                       run_root / ('best-dice-%d.pt' % args.fold),
                       multi2single=False)
        else:
            load_model(model,
                       run_root / ('best-model-%d.pt' % args.fold),
                       multi2single=False)
        model = model.cuda()

        valid_set = Dataset_cloud(valid_root,
                                  df=valid_fold,
                                  transform=test_transform,
                                  mode='test')
        valid_loader = DataLoader(valid_set,
                                  shuffle=False,
                                  batch_size=args.batch_size,
                                  num_workers=args.workers)

        area_ts_list = [0, 0, 0, 0]
        for pixel_ts in range(0, 80, 5):
            pixel_ts /= 100
            pixel_ts_list = [pixel_ts] * 4
            print('Processing: pixel-[%s]' % (str(pixel_ts)))
            predict(
                model,
                args.mode,
                loader=valid_loader,
                out_path=run_root / ('opt'),
                fold=args.fold,
                tta=args.tta,
                args=args,
                pixel_thresholds=pixel_ts_list,
                area_size=area_ts_list,
            )
    else:
        print('%s mode not implemented' % (args.mode))
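# Depending on args.lrc, the training branch above hands train() either a
# ReduceLROnPlateau, a CosineAnnealingLR, a WarmRestart scheduler, or None. That
# train() is not shown here; a small hedged sketch of a scheduler-agnostic
# per-epoch step, since only the plateau scheduler consumes the monitored metric:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def step_scheduler(scheduler, val_metric):
    # illustrative helper, not part of the example above
    if scheduler is None:
        return
    if isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(val_metric)  # plateau scheduling needs the validation metric
    else:
        scheduler.step()            # cosine / warm-restart schedules step unconditionally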
Example #18
0
def main():
    args = parse_args()
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        device = 'cuda'
        torch.cuda.manual_seed(args.seed)
    else:
        device = 'cpu'
    print(f"==> Using device: {device}")
    if args.checkpoint is None:
        time_stamp = str(datetime.datetime.now().strftime('-%Y%m%d%H%M%S'))
        args.checkpoint = args.model + time_stamp
    args.checkpoint = 'checkpoints/' + args.checkpoint
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
        save_args(args)
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title="ModelNet" + args.model)
        logger.set_names([
            "Epoch-Num", 'Learning-Rate', 'Train-Loss', 'Train-acc-B',
            'Train-acc', 'Valid-Loss', 'Valid-acc-B', 'Valid-acc'
        ])

    print('==> Preparing data..')
    train_loader = DataLoader(ModelNet40(partition='train',
                                         num_points=args.num_points,
                                         rotation=args.aug_rotate,
                                         scale=args.aug_scale),
                              num_workers=8,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(ModelNet40(partition='test',
                                        num_points=args.num_points),
                             num_workers=8,
                             batch_size=args.batch_size,
                             shuffle=True,
                             drop_last=False)

    # Model
    print('==> Building model..')
    net = models.__dict__[args.model]()
    criterion = cal_loss
    net = net.to(device)
    # criterion = criterion.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    scheduler = CosineAnnealingLR(optimizer,
                                  args.epoch,
                                  eta_min=args.learning_rate / 100)

    best_test_acc = 0.  # best test accuracy
    best_train_acc = 0.
    best_test_acc_avg = 0.
    best_train_acc_avg = 0.
    best_test_loss = float("inf")
    best_train_loss = float("inf")

    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    for epoch in range(start_epoch, args.epoch):
        print('Epoch(%d/%s) Learning Rate %s:' %
              (epoch + 1, args.epoch, optimizer.param_groups[0]['lr']))
        train_out = train(net, train_loader, optimizer, criterion,
                          device)  # {"loss", "acc", "acc_avg", "time"}
        test_out = validate(net, test_loader, criterion, device)
        scheduler.step()

        if test_out["acc"] > best_test_acc:
            best_test_acc = test_out["acc"]
            is_best = True
        else:
            is_best = False

        best_test_acc = test_out["acc"] if (
            test_out["acc"] > best_test_acc) else best_test_acc
        best_train_acc = train_out["acc"] if (
            train_out["acc"] > best_train_acc) else best_train_acc
        best_test_acc_avg = test_out["acc_avg"] if (
            test_out["acc_avg"] > best_test_acc_avg) else best_test_acc_avg
        best_train_acc_avg = train_out["acc_avg"] if (
            train_out["acc_avg"] > best_train_acc_avg) else best_train_acc_avg
        best_test_loss = test_out["loss"] if (
            test_out["loss"] < best_test_loss) else best_test_loss
        best_train_loss = train_out["loss"] if (
            train_out["loss"] < best_train_loss) else best_train_loss

        save_model(net,
                   epoch,
                   path=args.checkpoint,
                   acc=test_out["acc"],
                   is_best=is_best)
        logger.append([
            epoch, optimizer.param_groups[0]['lr'], train_out["loss"],
            train_out["acc_avg"], train_out["acc"], test_out["loss"],
            test_out["acc_avg"], test_out["acc"]
        ])
        print(
            f"Training loss:{train_out['loss']} acc_avg:{train_out['acc_avg']} acc:{train_out['acc']} time:{train_out['time']}s)"
        )
        print(
            f"Testing loss:{test_out['loss']} acc_avg:{test_out['acc_avg']} acc:{test_out['acc']}% time:{test_out['time']}s) \n\n"
        )
    logger.close()

    print(f"++++++++" * 2 + "Final results" + "++++++++" * 2)
    print(
        f"++  Last Train time: {train_out['time']} | Last Test time: {test_out['time']}  ++"
    )
    print(
        f"++  Best Train loss: {best_train_loss} | Best Test loss: {best_test_loss}  ++"
    )
    print(
        f"++  Best Train acc_B: {best_train_acc_avg} | Best Test acc_B: {best_test_acc_avg}  ++"
    )
    print(
        f"++  Best Train acc: {best_train_acc} | Best Test acc: {best_test_acc}  ++"
    )
    print(f"++++++++" * 5)
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ],
                  p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ],
                  p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ],
                  p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df,
                                     IMG_DIR,
                                     IMG_SIZE,
                                     N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0)
        val_dataset = SeverDataset(val_df,
                                   IMG_DIR,
                                   IMG_SIZE,
                                   N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  sampler=train_sampler,
                                  num_workers=8)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('resnet34',
                         encoder_weights="imagenet",
                         classes=N_CLASSES,
                         encoder_se_module=True,
                         decoder_semodule=True,
                         h_columns=False,
                         skip=True,
                         act="swish")
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer,
                                                 T_max=CLR_CYCLE,
                                                 eta_min=3e-5)
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=1.1,
                total_epoch=CLR_CYCLE * 2,
                after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CLR_CYCLE,
                                          eta_min=3e-5)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []

        best_model_loss = 999
        best_model_ep = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            if epoch % (CLR_CYCLE * 2) == 0:
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                checkpoint += 1
                best_model_loss = 999

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model,
                                      train_loader,
                                      criterion,
                                      optimizer,
                                      device,
                                      cutmix_prob=0.0)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(
                    model.module.state_dict(),
                    'models/{}_fold{}_ckpt{}.pth'.format(
                        EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                #np.save("val_pred.npy", val_pred)

            torch.save(model.module.state_dict(),
                       'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))

            #del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
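# The warmup above uses GradualWarmupScheduler, which is not part of
# torch.optim.lr_scheduler. With PyTorch >= 1.10 a similar warmup-then-cosine
# schedule can be sketched from the built-in LinearLR + SequentialLR (the epoch
# counts below are illustrative):
from torch import nn, optim
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

model = nn.Linear(4, 1)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
warmup = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=5)
cosine = CosineAnnealingLR(optimizer, T_max=25, eta_min=3e-5)
scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[5])

for epoch in range(30):
    # ... train one epoch ...
    scheduler.step()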
class FlowTrainer(object):
    def __init__(self):
        super(FlowTrainer, self).__init__()
        # not the best model...
        self.model = FlowEstimator(shape=(256, 256),
                                   use_l2=False,
                                   channel_in=3,
                                   stride=1,
                                   kernel_size=2,
                                   use_cst=True)
        self.optimizer = None
        self.lr_scheduler = None
        self.save_dir = None

        self.epoch = 1000

        self.train_loader = SintelLoader(
            batch_size=1,
            pin_memory=True,
            num_workers=8,
        )

        self.val_loader = None

        self.test_loader = SintelLoader(
            sintel_root="/data/keshav/sintel/test/final",
            batch_size=1,
            pin_memory=True,
            num_workers=8)

        self.sample_test = [
            *SintelLoader(sintel_root="/data/keshav/sintel/test/final",
                          test=True,
                          nsample=10,
                          visualize=True).load()
        ][0]
        self.sample_train = [*SintelLoader(nsample=10, visualize=True).load()
                             ][0]

        self.sample_val = None

        self.save_model_path = './best/'
        self.load_model_path = None
        self.best_metrics = {'train_loss': None, 'val_loss': None}
        self.gpu_ids = [0, 1, 2, 3, 4, 5, 6, 7]

        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.0001)
        # self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
        # self.scheduler = ReduceLROnPlateau(self.optimizer)
        self.scheduler = CosineAnnealingLR(self.optimizer,
                                           len(self.train_loader.load()))
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.photoloss = torch.nn.MSELoss()

        self.writer = SummaryWriter()
        self.global_step = 0

    def resetsample(self):
        self.sample_test = [
            *SintelLoader(sintel_root="/data/keshav/sintel/test/final",
                          test=True,
                          nsample=10,
                          visualize=True).load()
        ][0]
        self.sample_train = [*SintelLoader(nsample=10, visualize=True).load()
                             ][0]

    def initialize(self):
        self.model.to(self.device)
        self.model = torch.nn.DataParallel(self.model, device_ids=self.gpu_ids)
        if self.load_model_path:
            # LOAD MODEL WEIGHTS HERE
            pass
        self.initialized = True

    def savemodel(self, metrics, compare='val_loss'):
        # Save model in save_model_path
        best = self.best_metrics.get('val_loss')
        if best is None or best > metrics.get('val_loss'):
            # save only if new metrics are low
            self.best_metrics.update(metrics)
            pass
        else:
            # Load from the best saved
            pass

    def train_epoch_end(self, metrics):
        self.resetsample()
        self.model.eval()
        with torch.no_grad():
            frame1 = self.sample_train['frame1'].to(self.device)
            frame2 = self.sample_train['frame2'].to(self.device)

            frame1Unet = self.sample_train['frame1Unet'].to(self.device)
            frame2Unet = self.sample_train['frame2Unet'].to(self.device)
            frame1Unet_ = self.sample_train['frame1Unet_'].to(self.device)

            flow, occ = self.model(frame1, frame2)
            flow = flow * 256.
            frame1_ = warper(flow, frame2Unet)

            occ = replicatechannel(occ)

            #without unet
            # sampocc = replicatechannel(self.sample_train['occlusion'].cuda())
            #with unet
            sampocc = replicatechannel(
                self.sample_train['occlusionUnet'].cuda())

            occs = torch.cat([sampocc, occ])
            occs = make_grid(occs, nrow=10).unsqueeze(0)

            #without unet
            # frames = torch.cat([frame1_, frame1, frame2])
            # with unet
            frames = torch.cat([frame1_, frame1Unet_, frame1Unet, frame2Unet])

            frames = make_grid(frames, nrow=10).unsqueeze(0)

            #without unet
            # flows = torch.cat([flow2rgb(flow.cpu()).cuda(), self.sample_train['flow'].cuda()])
            # with unet
            flows = torch.cat([
                flow2rgb(flow.cpu(), scaled=True).cuda(),
                self.sample_train['flowUnet'].cuda()
            ])
            flows = make_grid(flows, nrow=10).unsqueeze(0)

            self.writer.add_images('TRAIN/Frames', frames,
                                   metrics.get('nb_batch'))
            self.writer.add_images('TRAIN/Flows', flows,
                                   metrics.get('nb_batch'))
            self.writer.add_images('TRAIN/Occlusions', occs,
                                   metrics.get('nb_batch'))

        return self.val(metrics)

    def val_end(self, metrics):
        return metrics

    def test_end(self, metrics):
        with torch.no_grad():
            frame1 = self.sample_test['frame1'].to(self.device)
            frame2 = self.sample_test['frame2'].to(self.device)

            frame1Unet = self.sample_test['frame1Unet'].to(self.device)
            frame2Unet = self.sample_test['frame2Unet'].to(self.device)

            flow, occ = self.model(frame1, frame2)
            frame1_ = warper(flow, frame2Unet)
            occ = replicatechannel(occ)

            frames = torch.cat([
                frame1_, frame1Unet, frame2Unet,
                flow2rgb(flow.cpu(), scaled=True).cuda(), occ
            ])
            frames = make_grid(frames, nrow=10).unsqueeze(0)

            self.writer.add_images('TEST/Frames', frames,
                                   metrics.get('nb_batch'))
        return metrics

    def train(self, nb_epoch):
        trainstream = tqdm(self.train_loader.load())
        self.avg_loss = AverageMeter()
        self.model.train()
        for i, data in enumerate(trainstream):
            self.global_step += 1
            trainstream.set_description('TRAINING')

            # GET X and Frame 2
            # wdt = data['displacement'].to(self.device)
            frame2 = data['frame2'].to(self.device)
            frame1 = data['frame1'].to(self.device)

            frame1Unet = data['frame1Unet'].to(self.device)
            frame2Unet = data['frame2Unet'].to(self.device)

            # frame1Unet1 = data['frame1Unet1'].to(self.device)
            # frame2Unet1 = data['frame2Unet1'].to(self.device)
            #
            # frame1Unet2 = data['frame1Unet2'].to(self.device)
            # frame2Unet2 = data['frame2Unet2'].to(self.device)
            #
            # frame1Unet3 = data['frame1Unet3'].to(self.device)
            # frame2Unet3 = data['frame2Unet3'].to(self.device)

            # frame1Unet4 = data['frame1Unet4'].to(self.device)
            # frame2Unet4 = data['frame2Unet4'].to(self.device)

            # frame1Unet5 = data['frame1Unet5'].to(self.device)
            # frame2Unet5 = data['frame2Unet5'].to(self.device)
            #
            # frame1Unet6 = data['frame1Unet6'].to(self.device)
            # frame2Unet6 = data['frame2Unet6'].to(self.device)

            self.optimizer.zero_grad()

            # forward
            with torch.set_grad_enabled(True):
                # flow1, flow2, flow3, flow4, flow5, flow6, flow, occ1, occ2, occ3, occ4, occ5, occ6, occ = self.model(frame1, frame2)
                flow, occ = self.model(frame1, frame2)

                print(flow.shape)
                print(frame2Unet.shape)

                frame1_ = warper(flow, frame2Unet)
                # frame1_1 = warper(flow1, frame2Unet1)
                # frame1_2 = warper(flow2, frame2Unet2)
                # frame1_3 = warper(flow3, frame2Unet3)
                # frame1_4 = warper(flow4, frame2Unet4)
                # frame1_5 = warper(flow5, frame2Unet5)
                # frame1_6 = warper(flow6, frame2Unet6)

                loss = comboloss(frame1Unet, frame2Unet, frame1_, occ)
                # loss1_1 = comboloss(frame1Unet1, frame2Unet1, frame1_1, occ1)
                # loss1_2 = comboloss(frame1Unet2, frame2Unet2, frame1_2, occ2)
                # loss1_3 = comboloss(frame1Unet3, frame2Unet3, frame1_3, occ3)
                # loss1_4 = comboloss(frame1Unet4, frame2Unet4, frame1_4, occ4)
                # loss1_5 = comboloss(frame1Unet5, frame2Unet5, frame1_5, occ5)
                # loss1_6 = comboloss(frame1Unet6, frame2Unet6, frame1_6, occ6)

                # loss = (loss1_ + loss1_4)/2.
                # loss = (loss1_ + loss1_4 + loss1_5 + loss1_6) / 4.

                # loss = (loss1_ + loss1_1 + loss1_2 + loss1_3 + loss1_4 + loss1_5 + loss1_6) / 7.

                #WITHOUT UNET
                # loss = photometricloss(frame1, frame1_, occ)
                #WITH UNET
                # loss = photometricloss(frame1Unet, frame1_,frame2Unet, occ)
                # loss = comboloss(frame1Unet,frame2Unet,frame1_,occ)
                self.avg_loss.update(loss.item(), i + 1)
                loss.backward()
                self.optimizer.step()

                self.writer.add_scalar('Loss/train', self.avg_loss.avg,
                                       self.global_step)

                trainstream.set_postfix({
                    'epoch': nb_epoch,
                    'loss': self.avg_loss.avg
                })
        self.scheduler.step()  # CosineAnnealingLR.step() does not take the loss
        trainstream.close()
        return self.train_epoch_end({
            'TRloss': self.avg_loss.avg,
            'epoch': nb_epoch,
        })

    def val(self, metrics):
        if self.val_loader is None: return self.test(metrics)
        # DO VAL STUFF HERE
        valstream = tqdm(self.val_loader.load())
        for data in valstream:
            pass
        return self.val_end(metrics)

    def test(self, metrics={}):
        teststream = tqdm(self.test_loader.load())
        self.avg_loss = AverageMeter()
        with torch.no_grad():
            for i, data in enumerate(teststream):
                teststream.set_description('TESTING')
                frame2 = data['frame2'].to(self.device)
                frame1 = data['frame1'].to(self.device)

                frame2Unet = data['frame2Unet'].to(self.device)
                frame1Unet = data['frame1Unet'].to(self.device)

                flow, occ = self.model(frame1, frame2)
                frame1_ = warper(flow, frame2Unet)
                # loss = photometricloss(frame1Unet, frame1_,frame2Unet, occ)
                loss = comboloss(frame1Unet, frame2Unet, frame1_, occ)
                self.avg_loss.update(loss.item(), i + 1)
                metrics.update({'TSloss': self.avg_loss.avg})
                teststream.set_postfix(metrics)
        self.writer.add_scalar('Loss/test', self.avg_loss.avg,
                               metrics.get('epoch'))
        teststream.close()

        return self.test_end(metrics)

    def loggings(self, **metrics):
        pass

    def run(self):
        self.initialize()
        for i in range(self.epoch):
            metrics = self.train(i)
        self.test(metrics)
        self.writer.close()
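# The trainer above builds CosineAnnealingLR with T_max taken from the loader
# length but only steps it once per epoch; in general T_max should match the
# number of scheduler.step() calls you plan to make. A minimal sketch of the
# per-batch variant (all names and sizes below are illustrative):
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR

epochs, batches_per_epoch = 5, 100
model = nn.Linear(8, 1)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=epochs * batches_per_epoch)

for _ in range(epochs):
    for _ in range(batches_per_epoch):
        optimizer.zero_grad()
        model(torch.randn(2, 8)).sum().backward()
        optimizer.step()
        scheduler.step()  # one step per batch, so T_max spans the whole run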
Example #21
0
print(datainfo)
print(datainfo.datasets['train'][0])

model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]),
              embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout)
print(model)

# 3. Declare the loss, metric, and optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad],
                lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

callbacks = []

callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
# callbacks.append(
#     LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
#                          ops.train_epoch * 0.8 else ops.lr * 0.1))
# )

# callbacks.append(
#     FitlogCallback(data=datainfo.datasets, verbose=1)
# )

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

# 4. Define the train method
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
Example #22
0
def DelayedCosineAnnealingLR(optimizer, delay_epochs, cosine_annealing_epochs):
    base_scheduler = CosineAnnealingLR(optimizer, cosine_annealing_epochs)
    return DelayerScheduler(optimizer, delay_epochs, base_scheduler)
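# DelayerScheduler is a helper from outside torch. With PyTorch >= 1.10 a
# comparable "hold the lr, then cosine-anneal" schedule can be sketched from the
# built-in ConstantLR + SequentialLR (epoch counts below are illustrative):
from torch import nn, optim
from torch.optim.lr_scheduler import ConstantLR, CosineAnnealingLR, SequentialLR

model = nn.Linear(4, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)
delay_epochs, cosine_epochs = 10, 40
hold = ConstantLR(optimizer, factor=1.0, total_iters=delay_epochs)  # keep lr flat
cosine = CosineAnnealingLR(optimizer, T_max=cosine_epochs)
scheduler = SequentialLR(optimizer, schedulers=[hold, cosine],
                         milestones=[delay_epochs])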
Example #23
0
def main(args):
    root = 'runs_' + args.dataset
    exp = Experiment(args,
                     root=root,
                     main='model',
                     ignore=('cuda', 'device', 'epochs', 'resume'))

    print(exp)
    if os.path.exists(exp.path_to('log')) and not args.resume:
        print('Skipping ...')
        sys.exit(0)

    train_data, test_data, in_ch, out = load_dataset(args)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             shuffle=False)

    if args.model == 'odenet':
        model = ODENet(in_ch,
                       out=out,
                       n_filters=args.filters,
                       downsample=args.downsample,
                       method=args.method,
                       tol=args.tol,
                       adjoint=args.adjoint,
                       dropout=args.dropout)
    else:
        model = ResNet(in_ch,
                       out=out,
                       n_filters=args.filters,
                       downsample=args.downsample,
                       dropout=args.dropout)

    model = model.to(args.device)
    if args.optim == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=args.lr,
                        momentum=0.9,
                        weight_decay=args.wd)
    elif args.optim == 'adam':
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)

    # print(train_data)
    # print(test_data)
    # print(model)
    # print(optimizer)

    if args.resume:
        ckpt = torch.load(exp.ckpt('last'))
        print('Loaded: {}'.format(exp.ckpt('last')))
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optim'])
        start_epoch = ckpt['epoch'] + 1
        best_accuracy = exp.log['test_acc'].max()
        print('Resuming from epoch {}: {}'.format(start_epoch, exp.name))
    else:
        metrics = evaluate(test_loader, model, args)
        best_accuracy = metrics['test_acc']
        start_epoch = 1

    if args.lrschedule == 'fixed':
        scheduler = LambdaLR(
            optimizer,
            lr_lambda=lambda x: 1)  # no-op scheduler, just for cleaner code
    elif args.lrschedule == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='max',
                                      patience=args.patience)
    elif args.lrschedule == 'cosine':
        scheduler = CosineAnnealingLR(optimizer,
                                      args.lrcycle,
                                      last_epoch=start_epoch - 2)

    progress = trange(start_epoch,
                      args.epochs + 1,
                      initial=start_epoch,
                      total=args.epochs)
    for epoch in progress:
        metrics = {'epoch': epoch}

        progress.set_postfix({'Best ACC': f'{best_accuracy:.2%}'})
        progress.set_description('TRAIN')
        train_metrics = train(train_loader, model, optimizer, args)

        progress.set_description('EVAL')
        test_metrics = evaluate(test_loader, model, args)

        is_best = test_metrics['test_acc'] > best_accuracy
        best_accuracy = max(test_metrics['test_acc'], best_accuracy)

        metrics.update(train_metrics)
        metrics.update(test_metrics)

        save_checkpoint(
            exp, {
                'epoch': epoch,
                'params': vars(args),
                'model': model.state_dict(),
                'optim': optimizer.state_dict(),
                'metrics': metrics
            }, is_best)

        exp.push_log(metrics)
        sched_args = metrics[
            'test_acc'] if args.lrschedule == 'plateau' else None
        scheduler.step(sched_args)
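# The cosine branch above rewinds the schedule when resuming via last_epoch. A
# common alternative is to checkpoint and restore the scheduler's own state next
# to the model and optimizer; a sketch that reuses the model / optimizer /
# scheduler / epoch names from the example above (the file name is illustrative):
state = {
    'epoch': epoch,
    'model': model.state_dict(),
    'optim': optimizer.state_dict(),
    'sched': scheduler.state_dict(),
}
torch.save(state, 'last_with_sched.pth')
# ... and on resume ...
ckpt = torch.load('last_with_sched.pth')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optim'])
scheduler.load_state_dict(ckpt['sched'])
start_epoch = ckpt['epoch'] + 1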
    def init_scheduler(self):
        # select the LR scheduler by name; only "cosine" is handled here
        if self.scheduler_name == "cosine":
            self.scheduler = CosineAnnealingLR(self.optimizer, T_max=10, eta_min=1e-5)
        else:
            self.scheduler = None
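
The init_scheduler helper above falls back to None when no cosine schedule is requested. A hedged sketch of what the configured CosineAnnealingLR does, decaying the learning rate from the optimizer's base lr to eta_min over T_max steps (all names below are local to this sketch):

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = Adam(params, lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)

for step in range(10):
    optimizer.step()
    scheduler.step()
    print(step, optimizer.param_groups[0]['lr'])  # anneals from 1e-3 down to 1e-5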
Example #25
def train(args, io):
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    # sample_rate=1.5 to make sure some overlap
    train_loader = DataLoader(S3DISDataset(split='train',
                                           data_root=args.data_dir,
                                           num_point=args.num_points,
                                           test_area=args.test_area,
                                           block_size=args.block_size,
                                           sample_rate=1.5,
                                           num_class=args.num_classes),
                              num_workers=8,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=True)

    test_loader = DataLoader(S3DISDataset(split='test',
                                          data_root=args.data_dir,
                                          num_point=args.num_points,
                                          test_area=args.test_area,
                                          block_size=args.block_size,
                                          sample_rate=1.5,
                                          num_class=args.num_classes),
                             num_workers=8,
                             batch_size=args.test_batch_size,
                             shuffle=True,
                             drop_last=True)

    device = torch.device("cuda" if args.cuda else "cpu")

    # Try to load models
    if args.model == 'dgcnn':
        model = DGCNN_semseg(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr * 100,
                        momentum=args.momentum,
                        weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    if args.scheduler == 'cos':
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3)
    elif args.scheduler == 'step':
        scheduler = StepLR(opt, step_size=20, gamma=0.5)

    try:
        checkpoint = torch.load(
            os.path.join(args.model_root, 'model_%s.t7' % args.test_area))
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        best_test_iou = checkpoint['mIOU']
        io.cprint('Use pretrained model')
    except Exception:
        io.cprint('No existing model, starting training from scratch...')
        start_epoch = 0
        best_test_iou = 0

    criterion = cal_loss

    log_dir = os.path.join(BASE_DIR, args.tb_dir)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    writer_train_loss = SummaryWriter(os.path.join(log_dir, 'train_loss'))
    writer_train_accuracy = SummaryWriter(os.path.join(log_dir))
    writer_train_iou = SummaryWriter(os.path.join(log_dir))
    writer_test_accuracy = SummaryWriter(os.path.join(log_dir))
    writer_test_iou = SummaryWriter(os.path.join(log_dir))

    for epoch in range(start_epoch, args.epochs):
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        niter = epoch * len(train_loader) * args.batch_size
        model.train()
        train_true_cls = []
        train_pred_cls = []
        train_true_seg = []
        train_pred_seg = []
        train_label_seg = []

        io.cprint('Start training for Epoch %d ...' % epoch)
        for data, seg in tqdm(train_loader):
            data, seg = data.to(device), seg.to(device)
            data = data.permute(0, 2, 1).float()
            batch_size = data.size()[0]
            opt.zero_grad()
            seg_pred = model(data)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            loss = criterion(seg_pred.view(-1, args.num_classes),
                             seg.view(-1, 1).squeeze().long())
            loss.backward()
            opt.step()
            pred = seg_pred.max(dim=2)[1]  # (batch_size, num_points)
            count += batch_size
            train_loss += loss.item() * batch_size
            niter += batch_size
            writer_train_loss.add_scalar('Train/loss', loss.item(), niter)
            seg_np = seg.cpu().numpy()  # (batch_size, num_points)
            pred_np = pred.detach().cpu().numpy()  # (batch_size, num_points)
            train_true_cls.append(
                seg_np.reshape(-1))  # (batch_size * num_points)
            train_pred_cls.append(
                pred_np.reshape(-1))  # (batch_size * num_points)
            train_true_seg.append(seg_np)
            train_pred_seg.append(pred_np)
        if args.scheduler == 'cos':
            scheduler.step()
        elif args.scheduler == 'step':
            if opt.param_groups[0]['lr'] > 1e-5:
                scheduler.step()
            if opt.param_groups[0]['lr'] < 1e-5:
                for param_group in opt.param_groups:
                    param_group['lr'] = 1e-5
        train_true_cls = np.concatenate(train_true_cls)
        train_pred_cls = np.concatenate(train_pred_cls)
        train_acc = metrics.accuracy_score(train_true_cls, train_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(
            train_true_cls, train_pred_cls)
        train_true_seg = np.concatenate(train_true_seg, axis=0)
        train_pred_seg = np.concatenate(train_pred_seg, axis=0)
        train_ious = calculate_sem_IoU(train_pred_seg, train_true_seg,
                                       args.num_classes)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f, train iou: %.6f' % (
            epoch, train_loss * 1.0 / count, train_acc, avg_per_class_acc,
            np.mean(train_ious))
        io.cprint(outstr)
        writer_train_accuracy.add_scalar('Train/accuracy', train_acc, epoch)
        writer_train_iou.add_scalar('Train/mIOU', np.mean(train_ious), epoch)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_true_cls = []
        test_pred_cls = []
        test_true_seg = []
        test_pred_seg = []

        io.cprint('Start evaluation for Epoch %d ...' % epoch)
        for data, seg in tqdm(test_loader):
            data, seg = data.to(device), seg.to(device)
            data = data.permute(0, 2, 1).float()
            batch_size = data.size()[0]
            seg_pred = model(data)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            loss = criterion(seg_pred.view(-1, args.num_classes),
                             seg.view(-1, 1).squeeze().long())
            pred = seg_pred.max(dim=2)[1]
            count += batch_size
            test_loss += loss.item() * batch_size
            seg_np = seg.cpu().numpy()
            pred_np = pred.detach().cpu().numpy()
            test_true_cls.append(seg_np.reshape(-1))
            test_pred_cls.append(pred_np.reshape(-1))
            test_true_seg.append(seg_np)
            test_pred_seg.append(pred_np)
        test_true_cls = np.concatenate(test_true_cls)
        test_pred_cls = np.concatenate(test_pred_cls)
        test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(
            test_true_cls, test_pred_cls)
        test_true_seg = np.concatenate(test_true_seg, axis=0)
        test_pred_seg = np.concatenate(test_pred_seg, axis=0)
        test_ious = calculate_sem_IoU(test_pred_seg, test_true_seg,
                                      args.num_classes)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f, test iou: %.6f' % (
            epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc,
            np.mean(test_ious))
        io.cprint(outstr)
        writer_test_accuracy.add_scalar('Test/accuracy', test_acc, epoch)
        writer_test_iou.add_scalar('Test/mIOU', np.mean(test_ious), epoch)

        if np.mean(test_ious) >= best_test_iou:
            best_test_iou = np.mean(test_ious)
            savepath = 'checkpoints/%s/models/model_%s.t7' % (args.exp_name,
                                                              args.test_area)
            io.cprint('Saving the best model at %s' % savepath)
            state = {
                'epoch': epoch,
                'mIOU': best_test_iou,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': opt.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
            }
            torch.save(state, savepath)

    writer_train_loss.close()
    writer_train_accuracy.close()
    writer_train_iou.close()
    writer_test_accuracy.close()
    writer_test_iou.close()
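
Example #25 restores model, optimizer and scheduler state from a single checkpoint dict inside a try/except. A self-contained sketch of that resume pattern (the file name demo_ckpt.t7 is illustrative, not from the example):

import os
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(4, 2)
opt = SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingLR(opt, T_max=100, eta_min=1e-3)

# persist everything needed to resume in one dict
state = {
    'epoch': 5,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': opt.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
}
torch.save(state, 'demo_ckpt.t7')

try:
    checkpoint = torch.load('demo_ckpt.t7')
    model.load_state_dict(checkpoint['model_state_dict'])
    opt.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
except (FileNotFoundError, KeyError):
    start_epoch = 0  # nothing to resume from
print('resuming from epoch', start_epoch)
os.remove('demo_ckpt.t7')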
Example #26
def train(args, io):
    train_dataset = ShapeNetPart(partition='trainval',
                                 num_points=args.num_points,
                                 class_choice=args.class_choice)
    # only drop the last incomplete batch when the dataset is large enough
    drop_last = len(train_dataset) >= 100
    train_loader = DataLoader(train_dataset,
                              num_workers=8,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=drop_last)
    test_loader = DataLoader(ShapeNetPart(partition='test',
                                          num_points=args.num_points,
                                          class_choice=args.class_choice),
                             num_workers=8,
                             batch_size=args.test_batch_size,
                             shuffle=True,
                             drop_last=False)

    device = torch.device("cuda:0" if args.cuda else "cpu")

    #Try to load models
    seg_num_all = train_loader.dataset.seg_num_all
    seg_start_index = train_loader.dataset.seg_start_index
    if args.model == 'dgcnn':
        model = DGCNN_partseg(args, seg_num_all).to(device)
    else:
        raise Exception("Not implemented")
    #print(str(model))

    #model = nn.DataParallel(model)
    print("Let's use", str(1), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr * 100,
                        momentum=args.momentum,
                        weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    if args.scheduler == 'cos':
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3)
    elif args.scheduler == 'step':
        scheduler = StepLR(opt, step_size=20, gamma=0.5)

    criterion = cal_loss

    best_test_iou = 0
    for epoch in range(args.epochs):
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_true_cls = []
        train_pred_cls = []
        train_true_seg = []
        train_pred_seg = []
        train_label_seg = []
        for data, label, seg in train_loader:
            seg = seg - seg_start_index
            label_one_hot = np.zeros((label.shape[0], 16))
            for idx in range(label.shape[0]):
                label_one_hot[idx, label[idx]] = 1
            label_one_hot = torch.from_numpy(label_one_hot.astype(np.float32))
            data, label_one_hot, seg = data.to(device), label_one_hot.to(
                device), seg.to(device)
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            opt.zero_grad()
            seg_pred = model(data, label_one_hot)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            loss = criterion(seg_pred.view(-1, seg_num_all),
                             seg.view(-1, 1).squeeze())
            loss.backward()
            opt.step()
            pred = seg_pred.max(dim=2)[1]  # (batch_size, num_points)
            count += batch_size
            train_loss += loss.item() * batch_size
            seg_np = seg.cpu().numpy()  # (batch_size, num_points)
            pred_np = pred.detach().cpu().numpy()  # (batch_size, num_points)
            train_true_cls.append(
                seg_np.reshape(-1))  # (batch_size * num_points)
            train_pred_cls.append(
                pred_np.reshape(-1))  # (batch_size * num_points)
            train_true_seg.append(seg_np)
            train_pred_seg.append(pred_np)
            train_label_seg.append(label.reshape(-1))
        if args.scheduler == 'cos':
            scheduler.step()
        elif args.scheduler == 'step':
            if opt.param_groups[0]['lr'] > 1e-5:
                scheduler.step()
            if opt.param_groups[0]['lr'] < 1e-5:
                for param_group in opt.param_groups:
                    param_group['lr'] = 1e-5
        train_true_cls = np.concatenate(train_true_cls)
        train_pred_cls = np.concatenate(train_pred_cls)
        train_acc = metrics.accuracy_score(train_true_cls, train_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(
            train_true_cls, train_pred_cls)
        train_true_seg = np.concatenate(train_true_seg, axis=0)
        train_pred_seg = np.concatenate(train_pred_seg, axis=0)
        train_label_seg = np.concatenate(train_label_seg)
        train_ious = calculate_shape_IoU(train_pred_seg, train_true_seg,
                                         train_label_seg, args.class_choice)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f, train iou: %.6f' % (
            epoch, train_loss * 1.0 / count, train_acc, avg_per_class_acc,
            np.mean(train_ious))
        io.cprint(outstr)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_true_cls = []
        test_pred_cls = []
        test_true_seg = []
        test_pred_seg = []
        test_label_seg = []
        for data, label, seg in test_loader:
            seg = seg - seg_start_index
            label_one_hot = np.zeros((label.shape[0], 16))
            for idx in range(label.shape[0]):
                label_one_hot[idx, label[idx]] = 1
            label_one_hot = torch.from_numpy(label_one_hot.astype(np.float32))
            data, label_one_hot, seg = data.to(device), label_one_hot.to(
                device), seg.to(device)
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            seg_pred = model(data, label_one_hot)
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            loss = criterion(seg_pred.view(-1, seg_num_all),
                             seg.view(-1, 1).squeeze())
            pred = seg_pred.max(dim=2)[1]
            count += batch_size
            test_loss += loss.item() * batch_size
            seg_np = seg.cpu().numpy()
            pred_np = pred.detach().cpu().numpy()
            test_true_cls.append(seg_np.reshape(-1))
            test_pred_cls.append(pred_np.reshape(-1))
            test_true_seg.append(seg_np)
            test_pred_seg.append(pred_np)
            test_label_seg.append(label.reshape(-1))
        test_true_cls = np.concatenate(test_true_cls)
        test_pred_cls = np.concatenate(test_pred_cls)
        test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(
            test_true_cls, test_pred_cls)
        test_true_seg = np.concatenate(test_true_seg, axis=0)
        test_pred_seg = np.concatenate(test_pred_seg, axis=0)
        test_label_seg = np.concatenate(test_label_seg)
        test_ious = calculate_shape_IoU(test_pred_seg, test_true_seg,
                                        test_label_seg, args.class_choice)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f, test iou: %.6f' % (
            epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc,
            np.mean(test_ious))
        io.cprint(outstr)
        if np.mean(test_ious) >= best_test_iou:
            best_test_iou = np.mean(test_ious)
            torch.save(model.state_dict(),
                       'checkpoints/%s/models/model.t7' % args.exp_name)
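
Example #26 builds the 16-way shape-category one-hot tensor with an explicit NumPy loop. A hedged, behaviour-equivalent sketch using torch.nn.functional.one_hot (toy labels below are made up):

import numpy as np
import torch
import torch.nn.functional as F

label = torch.tensor([3, 0, 15, 7])                        # category ids, batch of 4
label_one_hot = F.one_hot(label, num_classes=16).float()   # shape (4, 16)

# reference: the explicit loop from the example above
ref = np.zeros((label.shape[0], 16), dtype=np.float32)
for idx in range(label.shape[0]):
    ref[idx, int(label[idx])] = 1
assert np.allclose(label_one_hot.numpy(), ref)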
Example #27
    print(net.parameters)

    criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                             center_variance=0.1, size_variance=0.2, device=DEVICE)
    optimizer = torch.optim.RMSprop(params, lr=0.003, weight_decay=args.weight_decay, momentum=args.momentum)
    logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, "
                 + f"Extra Layers learning rate: {extra_layers_lr}.")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer, milestones=milestones,
                                gamma=0.1, last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(
            optimizer, args.t_max, last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        train(train_loader, net, criterion, optimizer,
              device=DEVICE, debug_steps=args.debug_steps, epoch=epoch)

        scheduler.step()

        if epoch % 10 == 0:
            # val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE)
            # logging.info(
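
Example #27 parses the MultiStepLR milestones from a comma-separated CLI string. A minimal sketch of that parsing and of the resulting step-wise decay (the milestone values below are made up):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

milestones_arg = "80, 100"
milestones = [int(v.strip()) for v in milestones_arg.split(",")]

optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.01)
scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

for epoch in range(120):
    optimizer.step()
    scheduler.step()
print(optimizer.param_groups[0]['lr'])  # 0.01 -> 0.001 at epoch 80 -> 0.0001 at epoch 100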
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape(
            -1, 1)
        y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape(
            -1, 1)
        y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape(
            -1, 1)
        y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape(
            -1, 1)
        y = np.concatenate([y1, y2, y3, y4], axis=1)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]
        y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ],
                  p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
            ],
                  p=0.5),
            OneOf([GaussNoise(p=0.5)], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df,
                                     IMG_DIR,
                                     IMG_SIZE,
                                     N_CLASSES,
                                     id_colname=ID_COLUMNS,
                                     transforms=train_augmentation,
                                     crop_rate=1.0,
                                     class_y=y_train)
        val_dataset = SeverDataset(val_df,
                                   IMG_DIR,
                                   IMG_SIZE,
                                   N_CLASSES,
                                   id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  sampler=train_sampler,
                                  num_workers=8,
                                  pin_memory=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                num_workers=8,
                                pin_memory=True)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('se_resnext50_32x4d',
                         encoder_weights="imagenet",
                         classes=N_CLASSES,
                         encoder_se_module=True,
                         decoder_semodule=True,
                         h_columns=False,
                         skip=True,
                         act="swish",
                         freeze_bn=True,
                         classification=CLASSIFICATION,
                         attention_type="cbam",
                         center=True,
                         mode="train")
        #model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam([
            {
                'params': model.decoder.parameters(),
                'lr': 3e-3
            },
            {
                'params': model.encoder.parameters(),
                'lr': 3e-4
            },
        ],
                                     eps=1e-4)
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer,
                                                 T_max=CLR_CYCLE,
                                                 eta_min=3e-5)
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=1.1,
                total_epoch=CLR_CYCLE * 2,
                after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CLR_CYCLE,
                                          eta_min=3e-5)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)

        if EMA:
            ema_model = copy.deepcopy(model)
            if base_model_ema is not None:
                ema_model.load_state_dict(torch.load(base_model_ema))
            ema_model.to(device)
            ema_model = torch.nn.DataParallel(ema_model)
        else:
            ema_model = None
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []

        best_model_loss = 999
        best_model_ema_loss = 999
        best_model_ep = 0
        ema_decay = 0
        checkpoint = base_ckpt + 1

        for epoch in range(84, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)

            if epoch >= EMA_START:
                ema_decay = 0.99

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model,
                                      train_loader,
                                      criterion,
                                      optimizer,
                                      device,
                                      cutmix_prob=0.0,
                                      classification=CLASSIFICATION,
                                      ema_model=ema_model,
                                      ema_decay=ema_decay)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model,
                                  val_loader,
                                  criterion,
                                  device,
                                  classification=CLASSIFICATION)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            if EMA and epoch >= EMA_START:
                ema_valid_loss = validate(ema_model,
                                          val_loader,
                                          criterion,
                                          device,
                                          classification=CLASSIFICATION)
                LOGGER.info('Mean EMA valid loss: {}'.format(
                    round(ema_valid_loss, 5)))

                if ema_valid_loss < best_model_ema_loss:
                    torch.save(
                        ema_model.module.state_dict(),
                        'models/{}_fold{}_ckpt{}_ema.pth'.format(
                            EXP_ID, FOLD_ID, checkpoint))
                    best_model_ema_loss = ema_valid_loss

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(
                    model.module.state_dict(),
                    'models/{}_fold{}_ckpt{}.pth'.format(
                        EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                #np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(
                    model.module.state_dict(),
                    'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(
                    round(best_model_loss, 5), best_model_ep))
                if EMA:
                    torch.save(
                        ema_model.module.state_dict(),
                        'models/{}_fold{}_latest_ema.pth'.format(
                            EXP_ID, FOLD_ID))
                    LOGGER.info('Best ema valid loss: {}'.format(
                        round(best_model_ema_loss, 5)))
                checkpoint += 1
                best_model_loss = 999

            #del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(
        round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
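
The training loop above keeps an EMA copy of the model and passes ema_decay into train_one_epoch; the update rule itself is not shown there. A common formulation of that exponential-moving-average update (an assumption for illustration, not taken from this example):

import copy
import torch

model = torch.nn.Linear(3, 1)
ema_model = copy.deepcopy(model)
ema_decay = 0.99

def update_ema(model, ema_model, decay):
    # ema_p <- decay * ema_p + (1 - decay) * p, applied parameter-wise
    with torch.no_grad():
        for p, ema_p in zip(model.parameters(), ema_model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)

# called once after each optimizer step:
update_ema(model, ema_model, ema_decay)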
Example #29
    def train(self):
        device = self.device
        print('Running on device: {}'.format(device), 'start training...')
        print(
            f'Setting - Epochs: {self.num_epochs}, Learning rate: {self.learning_rate} '
        )

        train_loader = self.train_loader
        valid_loader = self.valid_loader

        model = self.model.to(device)
        if self.optimizer == 0:
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=self.learning_rate,
                                         weight_decay=1e-5)
        elif self.optimizer == 1:
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=self.learning_rate,
                                          weight_decay=1e-5)
        elif self.optimizer == 2:
            optimizer = MADGRAD(model.parameters(),
                                lr=self.learning_rate,
                                weight_decay=1e-5)
        elif self.optimizer == 3:
            optimizer = AdamP(model.parameters(),
                              lr=self.learning_rate,
                              weight_decay=1e-5)
        criterion = torch.nn.CrossEntropyLoss().to(device)

        if self.use_swa:
            optimizer = SWA(optimizer, swa_start=2, swa_freq=2, swa_lr=1e-5)

        # scheduler #
        scheduler_dct = {
            0:
            None,
            1:
            StepLR(optimizer, 10, gamma=0.5),
            2:
            ReduceLROnPlateau(optimizer,
                              'min',
                              factor=0.4,
                              patience=int(0.3 *
                                           self.early_stopping_patience)),
            3:
            CosineAnnealingLR(optimizer, T_max=5, eta_min=0.)
        }
        scheduler = scheduler_dct[self.scheduler]

        # early stopping
        early_stopping = EarlyStopping(patience=self.early_stopping_patience,
                                       verbose=True,
                                       path=f'checkpoint_{self.job}.pt')

        # training
        self.train_loss_lst = list()
        self.train_acc_lst = list()
        self.val_loss_lst = list()
        self.val_acc_lst = list()
        for epoch in range(1, self.num_epochs + 1):
            with tqdm(train_loader, unit='batch') as tepoch:
                avg_val_loss, avg_val_acc = None, None

                for idx, (img, label) in enumerate(tepoch):
                    tepoch.set_description(f"Epoch {epoch}")

                    model.train()
                    optimizer.zero_grad()

                    img, label = img.float().to(device), label.long().to(
                        device)

                    output = model(img)
                    loss = criterion(output, label)
                    predictions = output.argmax(dim=1, keepdim=True).squeeze()
                    correct = (predictions == label).sum().item()
                    accuracy = correct / len(img)

                    loss.backward()
                    optimizer.step()

                    if idx == len(train_loader) - 1:

                        val_loss_lst, val_acc_lst = list(), list()

                        model.eval()
                        with torch.no_grad():
                            for val_img, val_label in valid_loader:
                                val_img, val_label = val_img.float().to(
                                    device), val_label.long().to(device)

                                val_out = model(val_img)
                                val_loss = criterion(val_out, val_label)
                                val_pred = val_out.argmax(
                                    dim=1, keepdim=True).squeeze()
                                val_acc = (val_pred == val_label
                                           ).sum().item() / len(val_img)

                                val_loss_lst.append(val_loss.item())
                                val_acc_lst.append(val_acc)

                        avg_val_loss = np.mean(val_loss_lst)
                        avg_val_acc = np.mean(val_acc_lst) * 100.

                        self.train_loss_lst.append(loss.item())
                        self.train_acc_lst.append(accuracy)
                        self.val_loss_lst.append(avg_val_loss)
                        self.val_acc_lst.append(avg_val_acc)

                    if scheduler is not None:
                        current_lr = optimizer.param_groups[0]['lr']
                    else:
                        current_lr = self.learning_rate

                    # log
                    tepoch.set_postfix(loss=loss.item(),
                                       accuracy=100. * accuracy,
                                       val_loss=avg_val_loss,
                                       val_acc=avg_val_acc,
                                       current_lr=current_lr)

                # early stopping check
                early_stopping(avg_val_loss, model)
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

                # scheduler update
                if scheduler is not None:
                    if self.scheduler == 2:
                        scheduler.step(avg_val_loss)
                    else:
                        scheduler.step()
        if self.use_swa:
            optimizer.swap_swa_sgd()
        self.model.load_state_dict(torch.load(f'checkpoint_{self.job}.pt'))
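
The trainer above relies on an external EarlyStopping object that checkpoints the best weights and raises early_stop after `patience` epochs without improvement. A minimal sketch of such a helper (a hedged stand-in, not the actual pytorchtools implementation):

import numpy as np
import torch

class SimpleEarlyStopping:
    def __init__(self, patience=5, path='checkpoint.pt'):
        self.patience = patience
        self.path = path
        self.best_loss = np.inf
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)  # keep the best weights
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# usage: stop once validation loss has not improved for `patience` epochs
stopper = SimpleEarlyStopping(patience=3, path='demo_best.pt')
model = torch.nn.Linear(2, 1)
for val_loss in [0.9, 0.8, 0.85, 0.86, 0.87]:
    stopper(val_loss, model)
    if stopper.early_stop:
        break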
def main(cfg):
    """Runs main training procedure."""

    # fix random seeds for reproducibility
    seed_everything(seed=cfg['seed'])

    # neptune logging
    neptune.init(project_qualified_name=cfg['neptune_project_name'],
                 api_token=cfg['neptune_api_token'])

    neptune.create_experiment(name=cfg['neptune_experiment'], params=cfg)

    print('Preparing model and data...')
    print('Using SMP version:', smp.__version__)

    num_classes = 1 if len(cfg['classes']) == 1 else (len(cfg['classes']) + 1)
    activation = 'sigmoid' if num_classes == 1 else 'softmax2d'
    background = not cfg['ignore_channels']
    binary = num_classes == 1
    softmax = not binary
    sigmoid = binary

    aux_params = dict(
        pooling=cfg['pooling'],  # one of 'avg', 'max'
        dropout=cfg['dropout'],  # dropout ratio, default is None
        activation='sigmoid',  # activation function, default is None
        classes=num_classes)  # define number of output labels

    # configure model
    models = {
        'unet':
        Unet(encoder_name=cfg['encoder_name'],
             encoder_weights=cfg['encoder_weights'],
             decoder_use_batchnorm=cfg['use_batchnorm'],
             classes=num_classes,
             activation=activation,
             aux_params=aux_params),
        'pspnet':
        PSPNet(encoder_name=cfg['encoder_name'],
               encoder_weights=cfg['encoder_weights'],
               classes=num_classes,
               activation=activation,
               aux_params=aux_params),
        'pan':
        PAN(encoder_name=cfg['encoder_name'],
            encoder_weights=cfg['encoder_weights'],
            classes=num_classes,
            activation=activation,
            aux_params=aux_params),
        'deeplabv3plus':
        DeepLabV3Plus(encoder_name=cfg['encoder_name'],
                      encoder_weights=cfg['encoder_weights'],
                      classes=num_classes,
                      activation=activation,
                      aux_params=aux_params)
    }

    assert cfg['architecture'] in models.keys()
    model = models[cfg['architecture']]

    # configure loss
    losses = {
        'dice_loss':
        DiceLoss(include_background=background,
                 softmax=softmax,
                 sigmoid=sigmoid,
                 batch=cfg['combine']),
        'generalized_dice':
        GeneralizedDiceLoss(include_background=background,
                            softmax=softmax,
                            sigmoid=sigmoid,
                            batch=cfg['combine'])
    }

    assert cfg['loss'] in losses.keys()
    loss = losses[cfg['loss']]

    # configure optimizer
    optimizers = {
        'adam': Adam([dict(params=model.parameters(), lr=cfg['lr'])]),
        'adamw': AdamW([dict(params=model.parameters(), lr=cfg['lr'])]),
        'rmsprop': RMSprop([dict(params=model.parameters(), lr=cfg['lr'])])
    }

    assert cfg['optimizer'] in optimizers.keys()
    optimizer = optimizers[cfg['optimizer']]

    # configure metrics
    metrics = {
        'dice_score':
        DiceMetric(include_background=background, reduction='mean'),
        'dice_smp':
        Fscore(threshold=cfg['rounding'],
               ignore_channels=cfg['ignore_channels']),
        'iou_smp':
        IoU(threshold=cfg['rounding'], ignore_channels=cfg['ignore_channels']),
        'generalized_dice':
        GeneralizedDiceLoss(include_background=background,
                            softmax=softmax,
                            sigmoid=sigmoid,
                            batch=cfg['combine']),
        'dice_loss':
        DiceLoss(include_background=background,
                 softmax=softmax,
                 sigmoid=sigmoid,
                 batch=cfg['combine']),
        'cross_entropy':
        BCELoss(reduction='mean'),
        'accuracy':
        Accuracy(ignore_channels=cfg['ignore_channels'])
    }

    assert all(m['name'] in metrics.keys() for m in cfg['metrics'])
    metrics = [(metrics[m['name']], m['name'], m['type'])
               for m in cfg['metrics']]  # tuple of (metric, name, type)

    # TODO: Fix metric names

    # configure scheduler
    schedulers = {
        'steplr':
        StepLR(optimizer, step_size=cfg['step_size'], gamma=0.5),
        'cosine':
        CosineAnnealingLR(optimizer,
                          cfg['epochs'],
                          eta_min=cfg['eta_min'],
                          last_epoch=-1)
    }

    assert cfg['scheduler'] in schedulers.keys()
    scheduler = schedulers[cfg['scheduler']]

    # configure augmentations
    train_transform = load_train_transform(transform_type=cfg['transform'],
                                           patch_size=cfg['patch_size_train'])
    valid_transform = load_valid_transform(
        patch_size=cfg['patch_size_valid'])  # manually selected patch size

    train_dataset = ArtifactDataset(df_path=cfg['train_data'],
                                    classes=cfg['classes'],
                                    transform=train_transform,
                                    normalize=cfg['normalize'],
                                    ink_filters=cfg['ink_filters'])

    valid_dataset = ArtifactDataset(df_path=cfg['valid_data'],
                                    classes=cfg['classes'],
                                    transform=valid_transform,
                                    normalize=cfg['normalize'],
                                    ink_filters=cfg['ink_filters'])

    test_dataset = ArtifactDataset(df_path=cfg['test_data'],
                                   classes=cfg['classes'],
                                   transform=valid_transform,
                                   normalize=cfg['normalize'],
                                   ink_filters=cfg['ink_filters'])

    # load pre-sampled patch arrays
    train_image, train_mask = train_dataset[0]
    valid_image, valid_mask = valid_dataset[0]
    print('Shape of image patch', train_image.shape)
    print('Shape of mask patch', train_mask.shape)
    print('Train dataset shape:', len(train_dataset))
    print('Valid dataset shape:', len(valid_dataset))
    assert train_image.shape[1] == cfg[
        'patch_size_train'] and train_image.shape[2] == cfg['patch_size_train']
    assert valid_image.shape[1] == cfg[
        'patch_size_valid'] and valid_image.shape[2] == cfg['patch_size_valid']

    # save intermediate augmentations
    if cfg['eval_dir']:
        default_dataset = ArtifactDataset(df_path=cfg['train_data'],
                                          classes=cfg['classes'],
                                          transform=None,
                                          normalize=None,
                                          ink_filters=cfg['ink_filters'])

        transform_dataset = ArtifactDataset(df_path=cfg['train_data'],
                                            classes=cfg['classes'],
                                            transform=train_transform,
                                            normalize=None,
                                            ink_filters=cfg['ink_filters'])

        for idx in range(0, min(500, len(train_dataset)), 10):
            image_input, image_mask = default_dataset[idx]
            image_input = image_input.transpose((1, 2, 0)).astype(np.uint8)

            image_mask = image_mask.transpose(1, 2, 0)
            image_mask = np.argmax(
                image_mask, axis=2) if not binary else image_mask.squeeze()
            image_mask = image_mask.astype(np.uint8)

            image_transform, _ = transform_dataset[idx]
            image_transform = image_transform.transpose(
                (1, 2, 0)).astype(np.uint8)

            idx_str = str(idx).zfill(3)
            skimage.io.imsave(os.path.join(cfg['eval_dir'],
                                           f'{idx_str}a_image_input.png'),
                              image_input,
                              check_contrast=False)
            plt.imsave(os.path.join(cfg['eval_dir'],
                                    f'{idx_str}b_image_mask.png'),
                       image_mask,
                       vmin=0,
                       vmax=6,
                       cmap='Spectral')
            skimage.io.imsave(os.path.join(cfg['eval_dir'],
                                           f'{idx_str}c_image_transform.png'),
                              image_transform,
                              check_contrast=False)

        del transform_dataset

    # update process
    print('Starting training...')
    print('Available GPUs for training:', torch.cuda.device_count())

    # pytorch module wrapper
    class DataParallelModule(torch.nn.DataParallel):
        def __getattr__(self, name):
            try:
                return super().__getattr__(name)
            except AttributeError:
                return getattr(self.module, name)

    # data parallel training
    if torch.cuda.device_count() > 1:
        model = DataParallelModule(model)

    train_loader = DataLoader(train_dataset,
                              batch_size=cfg['batch_size'],
                              num_workers=cfg['workers'],
                              shuffle=True)

    valid_loader = DataLoader(valid_dataset,
                              batch_size=int(cfg['batch_size'] / 4),
                              num_workers=cfg['workers'],
                              shuffle=False)

    test_loader = DataLoader(test_dataset,
                             batch_size=int(cfg['batch_size'] / 4),
                             num_workers=cfg['workers'],
                             shuffle=False)

    trainer = Trainer(model=model,
                      device=cfg['device'],
                      save_checkpoints=cfg['save_checkpoints'],
                      checkpoint_dir=cfg['checkpoint_dir'],
                      checkpoint_name=cfg['checkpoint_name'])

    trainer.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics,
                    num_classes=num_classes)

    trainer.fit(train_loader,
                valid_loader,
                epochs=cfg['epochs'],
                scheduler=scheduler,
                verbose=cfg['verbose'],
                loss_weight=cfg['loss_weight'],
                test_loader=test_loader,
                binary=binary)

    # validation inference
    model.load_state_dict(
        torch.load(os.path.join(cfg['checkpoint_dir'],
                                cfg['checkpoint_name'])))
    model.to(cfg['device'])
    model.eval()

    # save best checkpoint to neptune
    neptune.log_artifact(
        os.path.join(cfg['checkpoint_dir'], cfg['checkpoint_name']))

    # setup directory to save plots
    if os.path.isdir(cfg['plot_dir_valid']):
        shutil.rmtree(cfg['plot_dir_valid'])
    os.makedirs(cfg['plot_dir_valid'], exist_ok=True)

    # valid dataset without transformations and normalization for image visualization
    valid_dataset_vis = ArtifactDataset(df_path=cfg['valid_data'],
                                        classes=cfg['classes'],
                                        ink_filters=cfg['ink_filters'])

    # keep track of valid masks
    valid_preds = []
    valid_masks = []

    if cfg['save_checkpoints']:
        print('Predicting valid patches...')
        for n in range(len(valid_dataset)):
            image_vis = valid_dataset_vis[n][0].astype('uint8')
            image_vis = image_vis.transpose(1, 2, 0)
            image, gt_mask = valid_dataset[n]
            gt_mask = gt_mask.transpose(1, 2, 0)
            gt_mask = np.argmax(gt_mask,
                                axis=2) if not binary else gt_mask.squeeze()
            gt_mask = gt_mask.astype(np.uint8)
            valid_masks.append(gt_mask)

            x_tensor = torch.from_numpy(image).to(cfg['device']).unsqueeze(0)
            pr_mask, _ = model.predict(x_tensor)
            pr_mask = pr_mask.squeeze(axis=0).cpu().numpy().round()
            pr_mask = pr_mask.transpose(1, 2, 0)
            pr_mask = np.argmax(pr_mask,
                                axis=2) if not binary else pr_mask.squeeze()
            pr_mask = pr_mask.astype(np.uint8)
            valid_preds.append(pr_mask)

            save_predictions(out_path=cfg['plot_dir_valid'],
                             index=n + 1,
                             image=image_vis,
                             ground_truth_mask=gt_mask,
                             predicted_mask=pr_mask)

    del train_dataset, valid_dataset
    del train_loader, valid_loader

    # calculate dice per class
    valid_masks = np.stack(valid_masks, axis=0)
    valid_masks = valid_masks.flatten()
    valid_preds = np.stack(valid_preds, axis=0)
    valid_preds = valid_preds.flatten()
    dice_score = f1_score(y_true=valid_masks, y_pred=valid_preds, average=None)
    neptune.log_text('valid_dice_class', str(dice_score))
    print('Valid dice score (class):', str(dice_score))

    if cfg['evaluate_test_set']:
        print('Predicting test patches...')

        # setup directory to save plots
        if os.path.isdir(cfg['plot_dir_test']):
            shutil.rmtree(cfg['plot_dir_test'])
        os.makedirs(cfg['plot_dir_test'], exist_ok=True)

        # test dataset without transformations and normalization for image visualization
        test_dataset_vis = ArtifactDataset(df_path=cfg['test_data'],
                                           classes=cfg['classes'],
                                           ink_filters=cfg['ink_filters'])

        # keep track of test masks
        test_masks = []
        test_preds = []

        for n in range(len(test_dataset)):
            image_vis = test_dataset_vis[n][0].astype('uint8')
            image_vis = image_vis.transpose(1, 2, 0)
            image, gt_mask = test_dataset[n]
            gt_mask = gt_mask.transpose(1, 2, 0)
            gt_mask = np.argmax(gt_mask,
                                axis=2) if not binary else gt_mask.squeeze()
            gt_mask = gt_mask.astype(np.uint8)
            test_masks.append(gt_mask)

            x_tensor = torch.from_numpy(image).to(cfg['device']).unsqueeze(0)
            pr_mask, _ = model.predict(x_tensor)
            pr_mask = pr_mask.squeeze(axis=0).cpu().numpy().round()
            pr_mask = pr_mask.transpose(1, 2, 0)
            pr_mask = np.argmax(pr_mask,
                                axis=2) if not binary else pr_mask.squeeze()
            pr_mask = pr_mask.astype(np.uint8)
            test_preds.append(pr_mask)

            save_predictions(out_path=cfg['plot_dir_test'],
                             index=n + 1,
                             image=image_vis,
                             ground_truth_mask=gt_mask,
                             predicted_mask=pr_mask)

        # calculate dice per class (over all test patches, outside the loop)
        test_masks = np.stack(test_masks, axis=0)
        test_masks = test_masks.flatten()
        test_preds = np.stack(test_preds, axis=0)
        test_preds = test_preds.flatten()
        dice_score = f1_score(y_true=test_masks,
                              y_pred=test_preds,
                              average=None)
        neptune.log_text('test_dice_class', str(dice_score))
        print('Test dice score (class):', str(dice_score))

    # end of training process
    print('Finished training!')
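
The per-class Dice reported above comes from sklearn's f1_score with average=None on the flattened masks; for hard label maps, per-class F1 equals the Dice coefficient. A tiny sketch with toy data (values below are made up):

import numpy as np
from sklearn.metrics import f1_score

valid_masks = np.array([[0, 1, 2], [2, 1, 0]])   # ground-truth class ids
valid_preds = np.array([[0, 1, 1], [2, 1, 0]])   # predicted class ids

dice_per_class = f1_score(y_true=valid_masks.flatten(),
                          y_pred=valid_preds.flatten(),
                          average=None)
print(dice_per_class)   # one F1 (== Dice for hard labels) value per class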