Exemplo n.º 1
0
def train(initial_checkpoint):
    """Run the SGD training loop for ``Net`` on the fold's balanced train list.

    Builds the train/validation ``TGSDataset`` loaders, creates the network on
    the GPU, optionally restores model and optimizer state, then trains for at
    most ``num_iters`` iterations. The learning rate is set on every iteration
    from a cosine schedule that restarts every ``num_iters / 30`` iterations.
    Validation runs every ``iter_valid`` iterations; model and optimizer
    checkpoints are written every 500 iterations (and at ``num_iters - 1``)
    under ``CHECKPOINTS/<train_dataset.split>/``.

    Args:
        initial_checkpoint: Path to a ``*_model.pth`` state dict to resume
            from, or ``None`` to start from scratch. When given, the sibling
            file obtained by replacing ``_model.pth`` with ``_optimizer.pth``
            is also loaded; it supplies the optimizer state and the ``iter``
            and ``epoch`` counters.

    Returns:
        None. Progress is reported through ``Logger`` and checkpoints are
        written to disk.
    """

    ## setup  -----------------
    for subdir in ('/checkpoint', '/train', '/backup'):
        os.makedirs(CHECKPOINTS + subdir, exist_ok=True)

    log = Logger()
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % CODE)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tRESULT      = %s\n' % CHECKPOINTS)
    log.write('\n')
    log.write('\t<additional comments>\n')
    log.write('\t  ... \n')
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('Configuring dataset...\n')
    batch_size = 16

    train_split = 'list_train' + str(FOLD) + '_3600' + ne + "_balanced"
    train_dataset = TGSDataset(train_split, train_augment, 'train')
    os.makedirs(CHECKPOINTS + '/' + train_split, exist_ok=True)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=8,
        pin_memory=True,
        collate_fn=null_collate)

    valid_split = 'list_valid' + str(FOLD) + '_400' + ne + "_balanced"
    valid_dataset = TGSDataset(valid_split, valid_augment, 'train')
    valid_loader = DataLoader(valid_dataset,
                              sampler=RandomSampler(valid_dataset),
                              batch_size=batch_size,
                              drop_last=False,
                              num_workers=8,
                              pin_memory=True,
                              collate_fn=null_collate)

    # With drop_last=True a dataset smaller than one batch yields no batches
    # at all, and the training loop below would never advance.
    assert (len(train_dataset) >= batch_size)
    len_train_dataset = len(train_dataset)

    # Checkpoints are written below the dataset's own `split` name, so make
    # sure that exact directory exists (torch.save does not create parents).
    checkpoint_dir = CHECKPOINTS + "/" + train_dataset.split
    os.makedirs(checkpoint_dir, exist_ok=True)

    log.write('batch_size = %d\n' % (batch_size))
    log.write('train_dataset.split = %s\n' % (train_dataset.split))
    log.write('valid_dataset.split = %s\n' % (valid_dataset.split))
    log.write('\n')

    # Manual debug aid: flip to 1 to step through training samples on screen.
    if 0:  #debug  ##-------------------------------

        for inputs, truth, index, cache in train_loader:
            images = inputs.cpu().data.numpy().squeeze()
            masks = truth.cpu().data.numpy().squeeze()
            for b in range(len(index)):
                image = images[b] * 255
                image = np.dstack([image, image, image])

                mask = masks[b]

                image_show('image', image, resize=2)
                image_show_norm('mask', mask, max=1, resize=2)

                overlay0 = draw_mask_overlay(mask, image, color=[0, 0, 255])
                overlay0 = draw_mask_to_contour_overlay(mask,
                                                        overlay0,
                                                        2,
                                                        color=[0, 0, 255])

                image_show('overlay0', overlay0, resize=2)
                cv2.waitKey(0)
    #--------------------------------------

    ## net ----------------------------------------
    log.write('Configuring neural network...\n')
    net = Net().cuda()

    if initial_checkpoint is not None:
        log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
        # Load onto CPU storage first; load_state_dict copies into the GPU net.
        net.load_state_dict(
            torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage))

    log.write("The net is an instance of {}.".format(type(net)))
    log.write('\n')

    ## optimiser ----------------------------------
    num_iters = 300 * 1000
    iter_smooth = 20
    iter_valid = 100
    # A set gives O(1) membership tests inside the training loop.
    iter_save = set(range(0, num_iters, 500)) | {0, num_iters - 1}

    FREEZE = False
    #------------------------------------------------------
    if FREEZE:  ##freeze
        for p in net.feature_net.parameters():
            p.requires_grad = False
    #------------------------------------------------------

    base_lr = 0.01
    cycle_len = num_iters / 30  # iterations per cosine cycle

    def scheduler(x):
        # Cosine decay from base_lr towards 0 within a cycle, then restart.
        phase = np.mod(x - 1, cycle_len) / cycle_len
        return (base_lr / 2) * (np.cos(PI * phase) + 1)

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=base_lr,
                          momentum=0.9,
                          weight_decay=0.0001)

    start_iter = 0
    start_epoch = 0
    if initial_checkpoint is not None:
        checkpoint = torch.load(
            initial_checkpoint.replace('_model.pth', '_optimizer.pth'))
        start_iter = checkpoint['iter']
        start_epoch = checkpoint['epoch']

        rate = get_learning_rate(optimizer)  #load all except learning rate
        optimizer.load_state_dict(checkpoint['optimizer'])
        adjust_learning_rate(optimizer, rate)

    ## start training here! ##############################################
    log.write('Start training...\n')

    log.write(
        ' rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          \n'
    )
    log.write(
        '-------------------------------------------------------------------------------------------------------------------------------\n'
    )

    # One status line layout shared by the validation log and the live ticker.
    status_fmt = '%0.4f  %5.1f  %6.1f  |  %0.3f  %0.3f  (%0.3f) |  %0.3f  %0.3f  |  %0.3f  %0.3f  | %s '

    train_loss = np.zeros(6, np.float32)
    valid_loss = np.zeros(6, np.float32)
    batch_loss = np.zeros(6, np.float32)
    rate = 0
    iteration = start_iter
    epoch = start_epoch
    i = 0  # batches processed in this run (iteration = i + start_iter)
    stop_training = False

    start = timer()
    # loop over the dataset multiple times
    while iteration < num_iters and not stop_training:
        sum_train_loss = np.zeros(6, np.float32)
        num_batches = 0

        optimizer.zero_grad()
        for inputs, truth, index, cache in train_loader:

            # Manual debug aid: flip to 1 to inspect each batch on screen.
            if 0:  #debug  ##-------------------------------

                image = inputs.cpu().data.numpy().squeeze()
                mask = truth.cpu().data.numpy().squeeze()

                for b in range(len(index)):
                    image_show_norm('image', image[b], max=1, resize=2)
                    image_show_norm('mask', mask[b], max=1, resize=2)
                    cv2.waitKey(0)
            #--------------------------------------

            batch_size = len(index)
            iteration = i + start_iter
            if iteration >= num_iters:
                # Stop exactly at the budget instead of finishing the epoch.
                break
            epoch = (iteration -
                     start_iter) * batch_size / len_train_dataset + start_epoch

            if iteration % iter_valid == 0:
                net.set_mode('valid')
                valid_loss = validation(net, valid_loader)
                net.set_mode('train')

                log.write2('\r')
                log.write(status_fmt % (
                    rate, iteration / 1000, epoch,
                    valid_loss[0], valid_loss[1], valid_loss[2],
                    train_loss[0], train_loss[1],
                    batch_loss[0], batch_loss[1],
                    time_to_str((timer() - start), 'min')) + '\n')
                time.sleep(0.01)

            if iteration in iter_save:
                checkpoint_prefix = (checkpoint_dir + '/' + MODEL + OHEM +
                                     '%08d' % (iteration))
                torch.save(net.state_dict(), checkpoint_prefix + '_model.pth')
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': iteration,
                        'epoch': epoch,
                    }, checkpoint_prefix + '_optimizer.pth')

            # learning rate scheduler -------------
            if scheduler is not None:
                lr = scheduler(iteration)
                if lr < 0:
                    # A negative rate is the stop signal. Leave BOTH loops;
                    # a bare break would restart the epoch at the same
                    # iteration forever.
                    stop_training = True
                    break
                adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)

            # one iteration update  -------------
            net.set_mode('train')

            inputs = inputs.cuda()
            truth = truth.cuda()

            logit = data_parallel(net, inputs)  #net(inputs)

            if OHEM == "OHEM":
                loss = net.focal_loss(logit, truth, 1.0, 0.5,
                                      0.25) + net.criterion(logit, truth)
            else:
                loss = net.criterion(logit, truth)

            dice = net.metric(logit, truth)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # print statistics  ------------
            batch_loss = np.array((
                loss.item(),
                dice.item(),
                0,
                0,
                0,
                0,
            ))
            sum_train_loss += batch_loss
            num_batches += 1
            if iteration % iter_smooth == 0:
                # Report the mean over the batches seen since the last reset.
                train_loss = sum_train_loss / num_batches
                sum_train_loss = np.zeros(6, np.float32)
                num_batches = 0

            log.write2('\r' + status_fmt % (
                rate, iteration / 1000, epoch,
                valid_loss[0], valid_loss[1], valid_loss[2],
                train_loss[0], train_loss[1],
                batch_loss[0], batch_loss[1],
                time_to_str((timer() - start), 'min')))
            i = i + 1

    log.write('\n')
Exemplo n.º 2
0
                torch.load(initial_checkpoint,
                           map_location=lambda storage, loc: storage))

            valid_num = 0
            valid_loss = np.zeros(3, np.float32)

            predicts_l = []
            truths_l = []

            for input, truth, index, cache in valid_loader:
                input = input.cuda()
                truth = truth.cuda()
                with torch.no_grad():
                    logit = data_parallel(model, input)  #net(input)
                    prob = F.sigmoid(logit)
                loss = model.criterion(logit, truth)
                dice = model.metric(logit, truth)

                batch_size = len(index)
                valid_loss += batch_size * np.array(
                    (loss.item(), dice.item(), 0))
                valid_num += batch_size

                prob = prob[:, :, Y0:Y1, X0:X1]
                truth = truth[:, :, Y0:Y1, X0:X1]
                prob = F.avg_pool2d(prob, kernel_size=2, stride=2)
                truth = F.avg_pool2d(truth, kernel_size=2, stride=2)
                predicts.append(prob.data.cpu().numpy())
                truths.append(truth.data.cpu().numpy())

                #assert(valid_num == len(valid_loader.sampler))
def train():
    """Run the SGD training loop for ``Net`` on the ``list_train8_3600`` split.

    Builds the train/validation ``TGSDataset`` loaders, creates the network on
    the GPU and trains for at most ``num_iters`` iterations. The learning rate
    is set on every iteration from a cosine schedule that restarts every
    ``num_iters / 30`` iterations. Validation runs every ``iter_valid``
    iterations; checkpoints are written every 500 iterations (and at
    ``num_iters - 1``) under ``CHECKPOINTS/<train_dataset.split>/``, and a
    final model/optimizer pair is written under
    ``CHECKPOINTS/checkpoint/<train_dataset.split>/`` when training ends.

    ``initial_checkpoint`` is hard-coded to ``None``; set it to a
    ``*_model.pth`` path to resume the weights and the ``iter``/``epoch``
    counters from a previous run.

    Returns:
        None. Progress is printed to stdout and checkpoints are written to
        disk.
    """

    initial_checkpoint = None  # e.g. 'checkpoint/00048500_model.pth'

    ## setup  -----------------
    for subdir in ('/checkpoint', '/train', '/backup'):
        os.makedirs(CHECKPOINTS + subdir, exist_ok=True)

    # NOTE(review): `log` is never used below (everything goes through
    # print); kept in case constructing Logger has side effects - confirm.
    log = Logger()
    print('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    print('\tSEED         = %u\n' % SEED)
    print('\tPROJECT_PATH = %s\n' % CODE)
    print('\t__file__     = %s\n' % __file__)
    print('\tRESULT      = %s\n' % CHECKPOINTS)
    print('\n')
    print('\t<additional comments>\n')
    print('\t  ... \n')
    print('\n')

    ## dataset ----------------------------------------
    print('Configuring dataset...\n')
    batch_size = 16

    train_dataset = TGSDataset('list_train8_3600', train_augment, 'train')
    os.makedirs(CHECKPOINTS + '/list_train8_3600', exist_ok=True)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=8,
        pin_memory=True,
        collate_fn=null_collate)

    valid_dataset = TGSDataset('list_valid8_400', valid_augment, 'train')
    valid_loader = DataLoader(valid_dataset,
                              sampler=RandomSampler(valid_dataset),
                              batch_size=batch_size,
                              drop_last=False,
                              num_workers=8,
                              pin_memory=True,
                              collate_fn=null_collate)

    # With drop_last=True a dataset smaller than one batch yields no batches
    # at all, and the training loop below would never advance.
    assert (len(train_dataset) >= batch_size)
    len_train_dataset = len(train_dataset)

    # torch.save does not create parent directories, so create both places
    # this function writes to: the periodic directory keyed by the dataset's
    # own `split`, and the final-save directory below 'checkpoint/'.
    checkpoint_dir = CHECKPOINTS + "/" + train_dataset.split
    final_dir = CHECKPOINTS + '/checkpoint/' + train_dataset.split
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(final_dir, exist_ok=True)

    print('batch_size = %d\n' % (batch_size))
    print('train_dataset.split = %s\n' % (train_dataset.split))
    print('valid_dataset.split = %s\n' % (valid_dataset.split))
    print('\n')

    # Manual debug aid: flip to 1 to step through training samples on screen.
    if 0:  #debug  ##-------------------------------

        for inputs, truth, index, cache in train_loader:
            images = inputs.cpu().data.numpy().squeeze()
            masks = truth.cpu().data.numpy().squeeze()
            for b in range(len(index)):
                image = images[b] * 255
                image = np.dstack([image, image, image])

                mask = masks[b]

                image_show('image', image, resize=2)
                image_show_norm('mask', mask, max=1, resize=2)

                overlay0 = draw_mask_overlay(mask, image, color=[0, 0, 255])
                overlay0 = draw_mask_to_contour_overlay(mask,
                                                        overlay0,
                                                        2,
                                                        color=[0, 0, 255])

                image_show('overlay0', overlay0, resize=2)
                cv2.waitKey(0)
    #--------------------------------------

    ## net ----------------------------------------
    print('Configuring neural network...\n')
    net = Net().cuda()

    if initial_checkpoint is not None:
        print('\tinitial_checkpoint = %s\n' % initial_checkpoint)
        # Load onto CPU storage first; load_state_dict copies into the GPU net.
        net.load_state_dict(
            torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage))

    print("The net is an instance of {}.".format(type(net)))
    print('\n')

    ## optimiser ----------------------------------
    num_iters = 300 * 1000
    iter_smooth = 20
    iter_valid = 100
    # A set gives O(1) membership tests inside the training loop.
    iter_save = set(range(0, num_iters, 500)) | {0, num_iters - 1}

    FREEZE = False
    #------------------------------------------------------
    if FREEZE:  ##freeze
        for p in net.feature_net.parameters():
            p.requires_grad = False
    #------------------------------------------------------

    base_lr = 0.01
    cycle_len = num_iters / 30  # iterations per cosine cycle

    def scheduler(x):
        # Cosine decay from base_lr towards 0 within a cycle, then restart.
        phase = np.mod(x - 1, cycle_len) / cycle_len
        return (base_lr / 2) * (np.cos(PI * phase) + 1)

    # Sanity print of the schedule at the start, middle and restart of a cycle.
    print(scheduler(1))
    print(scheduler(5000))
    print(scheduler(10001))

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=base_lr,
                          momentum=0.9,
                          weight_decay=0.0001)

    start_iter = 0
    start_epoch = 0
    if initial_checkpoint is not None:
        checkpoint = torch.load(
            initial_checkpoint.replace('_model.pth', '_optimizer.pth'))
        start_iter = checkpoint['iter']
        start_epoch = checkpoint['epoch']

        # NOTE(review): only the counters are restored here; the optimizer
        # state load is disabled, so the two rate calls below re-apply the
        # freshly constructed learning rate - confirm this is intended.
        rate = get_learning_rate(optimizer)  #load all except learning rate
        #optimizer.load_state_dict(checkpoint['optimizer'])
        adjust_learning_rate(optimizer, rate)

    ## start training here! ##############################################
    print('Start training...\n')

    print(
        ' rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          \n'
    )
    print(
        '-------------------------------------------------------------------------------------------------------------------------------\n'
    )

    # One status line layout shared by the validation log and the live ticker.
    status_fmt = '%0.4f  %5.1f  %6.1f  |  %0.3f  %0.3f  (%0.3f) |  %0.3f  %0.3f  |  %0.3f  %0.3f  | %s '

    train_loss = np.zeros(6, np.float32)
    valid_loss = np.zeros(6, np.float32)
    batch_loss = np.zeros(6, np.float32)
    rate = 0
    iteration = start_iter
    # Bound up front so the final save cannot hit an unbound `epoch` when the
    # loop body never runs.
    epoch = start_epoch
    i = 0  # batches processed in this run (iteration = i + start_iter)
    stop_training = False

    start = timer()
    # loop over the dataset multiple times
    while iteration < num_iters and not stop_training:
        sum_train_loss = np.zeros(6, np.float32)
        num_batches = 0

        optimizer.zero_grad()
        for inputs, truth, index, cache in train_loader:

            # Manual debug aid: flip to 1 to inspect each batch on screen.
            if 0:  #debug  ##-------------------------------

                image = inputs.cpu().data.numpy().squeeze()
                mask = truth.cpu().data.numpy().squeeze()

                for b in range(len(index)):
                    image_show_norm('image', image[b], max=1, resize=2)
                    image_show_norm('mask', mask[b], max=1, resize=2)
                    cv2.waitKey(0)
            #--------------------------------------

            batch_size = len(index)
            iteration = i + start_iter
            if iteration >= num_iters:
                # Stop exactly at the budget instead of finishing the epoch.
                break
            epoch = (iteration -
                     start_iter) * batch_size / len_train_dataset + start_epoch

            if iteration % iter_valid == 0:
                net.set_mode('valid')
                valid_loss = validation(net, valid_loader)
                net.set_mode('train')

                print('\r', end='', flush=True)
                print(status_fmt % (
                    rate, iteration / 1000, epoch,
                    valid_loss[0], valid_loss[1], valid_loss[2],
                    train_loss[0], train_loss[1],
                    batch_loss[0], batch_loss[1],
                    time_to_str((timer() - start), 'min')) + '\n')
                time.sleep(0.01)

            if iteration in iter_save:
                checkpoint_prefix = checkpoint_dir + '/%08d' % (iteration)
                torch.save(net.state_dict(), checkpoint_prefix + '_model.pth')
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': iteration,
                        'epoch': epoch,
                    }, checkpoint_prefix + '_optimizer.pth')

            # learning rate scheduler -------------
            if scheduler is not None:
                lr = scheduler(iteration)
                if lr < 0:
                    # A negative rate is the stop signal. Leave BOTH loops;
                    # a bare break would restart the epoch at the same
                    # iteration forever.
                    stop_training = True
                    break
                adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)

            # one iteration update  -------------
            net.set_mode('train')

            inputs = inputs.cuda()
            truth = truth.cuda()

            logit = data_parallel(net, inputs)  #net(inputs)
            loss = net.criterion(logit, truth)
            dice = net.metric(logit, truth)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # print statistics  ------------
            batch_loss = np.array((
                loss.item(),
                dice.item(),
                0,
                0,
                0,
                0,
            ))
            sum_train_loss += batch_loss
            num_batches += 1
            if iteration % iter_smooth == 0:
                # Report the mean over the batches seen since the last reset.
                train_loss = sum_train_loss / num_batches
                sum_train_loss = np.zeros(6, np.float32)
                num_batches = 0

            print('\r' + status_fmt % (
                rate, iteration / 1000, epoch,
                valid_loss[0], valid_loss[1], valid_loss[2],
                train_loss[0], train_loss[1],
                batch_loss[0], batch_loss[1],
                time_to_str((timer() - start), 'min')), end='', flush=True)
            i = i + 1

            #<debug> ===================================================================
            # Manual debug aid: flip to 1 to view input / truth / prediction
            # side by side for one sample of the current batch.
            if 0:
                net.set_mode('test')  #
                with torch.no_grad():
                    logit = net(inputs)
                    prob = F.sigmoid(logit)
                    loss = net.criterion(logit, truth)
                    dice = net.metric(logit, truth)

                    if 0:
                        loss = net.criterion(logit, truth)
                        accuracy, hit_rate, precision_rate = net.metric(
                            logit, truth)
                        valid_loss[0] = loss.item()
                        valid_loss[1] = accuracy.item()
                        valid_loss[2] = hit_rate.item()
                        valid_loss[3] = precision_rate.item()

                #show only b in batch ---
                b = 1
                prob = prob.data.cpu().numpy()[b].squeeze()
                truth = truth.data.cpu().numpy()[b].squeeze()
                inputs = inputs.data.cpu().numpy()[b].squeeze()

                panel = np.hstack([inputs, truth, prob])
                image_show_norm('all', panel, max=1, resize=3)
                cv2.waitKey(100)

                net.set_mode('train')
            #<debug> ===================================================================

    # save last --------------------------------------
    # Store the absolute iteration (run-local count plus the resume offset),
    # the same value the periodic checkpoints store, so a later resume
    # continues from the right place. With no resume this equals `i`.
    last_iter = start_iter + i
    torch.save(net.state_dict(), final_dir + '/%d_model.pth' % (last_iter))
    torch.save(
        {
            'optimizer': optimizer.state_dict(),
            'iter': last_iter,
            'epoch': epoch,
        }, final_dir + '/%d_optimizer.pth' % (last_iter))

    print('\n')