Example #1
def handler(context):
    print(
        f'start training with parameters : {Parameters.as_dict()}, context : {context}'
    )

    try:
        dataset_alias = context.datasets  # for older version
    except AttributeError:
        dataset_alias = context['datasets']

    train_dataset_id, val_dataset_id = get_dataset_ids(dataset_alias)

    id2index, _ = set_categories(list(dataset_alias.values()))
    num_classes = len(id2index)
    num_classes += 1  # add for background class

    print(f'number of classes : {num_classes}')

    print("Start downloading datasets.")
    dataset_items = list(
        load_dataset_from_api(train_dataset_id, max_num=Parameters.MAX_ITEMS))
    print("Finish downloading datasets.")

    random.shuffle(dataset_items)
    if val_dataset_id is not None:
        val_dataset_items = list(
            load_dataset_from_api(val_dataset_id,
                                  max_num=Parameters.MAX_ITEMS))
        random.shuffle(val_dataset_items)
        train_dataset_items = dataset_items
    else:
        test_size = int(len(dataset_items) * Parameters.TEST_SIZE)
        train_dataset_items, val_dataset_items = dataset_items[
            test_size:], dataset_items[:test_size]

    train_dataset = ABEJAPlatformDataset(train_dataset_items,
                                         phase="train",
                                         transform=DataTransform(
                                             Parameters.IMG_SIZE,
                                             Parameters.MEANS))

    val_dataset = ABEJAPlatformDataset(val_dataset_items,
                                       phase="val",
                                       transform=DataTransform(
                                           Parameters.IMG_SIZE,
                                           Parameters.MEANS))

    print(f'train dataset : {len(train_dataset)}')
    print(f'val dataset : {len(val_dataset)}')

    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=Parameters.BATCH_SIZE,
                                       shuffle=Parameters.SHUFFLE,
                                       collate_fn=od_collate_fn)

    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=Parameters.BATCH_SIZE,
                                     shuffle=False,
                                     collate_fn=od_collate_fn)

    dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
    print(f'data loaders : {dataloaders_dict}')

    ssd_cfg = {
        'num_classes': num_classes,  # number of classes including the background class
        'input_size': Parameters.IMG_SIZE,
        'bbox_aspect_num': Parameters.BBOX_ASPECT_NUM,
        'feature_maps': Parameters.FEATURE_MAPS,
        'steps': Parameters.STEPS,
        'min_sizes': Parameters.MIN_SIZES,
        'max_sizes': Parameters.MAX_SIZES,
        'aspect_ratios': Parameters.ASPECT_RATIOS,
        'conf_thresh': Parameters.CONF_THRESHOLD,
        'top_k': Parameters.TOP_K,
        'nms_thresh': Parameters.NMS_THRESHOLD
    }
    net = SSD(phase="train", cfg=ssd_cfg)

    # TODO: better to host this file by ourselves
    # https://github.com/amdegroot/ssd.pytorch#training-ssd
    url = 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth'
    weight_file = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                               'vgg16_reducedfc.pth')
    download(url, weight_file)

    vgg_weights = torch.load(weight_file)
    net.vgg.load_state_dict(vgg_weights)
    print('finished loading the base network weights')

    def weights_init(m):
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:  # in case of bias
                nn.init.constant_(m.bias, 0.0)

    # apply He (Kaiming) initialization to the newly added layers
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)

    # configure loss function
    criterion = MultiBoxLoss(jaccard_thresh=Parameters.OVERLAP_THRESHOLD,
                             neg_pos=Parameters.NEG_POS,
                             device=device)

    # configure optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=Parameters.LR,
                          momentum=Parameters.MOMENTUM,
                          dampening=Parameters.DAMPENING,
                          weight_decay=Parameters.WEIGHT_DECAY,
                          nesterov=Parameters.NESTEROV)

    # move network to device
    net.to(device)

    # NOTE: This flag enables the built-in cuDNN auto-tuner,
    # which finds the best convolution algorithm for your hardware.
    # cf. https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2
    torch.backends.cudnn.benchmark = True

    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    latest_epoch_train_loss = epoch_train_loss
    latest_epoch_val_loss = epoch_val_loss

    for epoch in range(Parameters.EPOCHS):

        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch + 1, Parameters.EPOCHS))
        print('-------------')

        # loop of train and validation for each epoch
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('(train)')
            else:
                if (epoch + 1) % 10 == 0:
                    net.eval()
                    print('-------------')
                    print('(val)')
                else:
                    # run validation only once every ten epochs; skip otherwise
                    continue

            # loop each mini-batch from data loader
            for images, targets in dataloaders_dict[phase]:

                images = images.to(device)
                targets = [ann.to(device) for ann in targets]

                # initialize optimizer
                optimizer.zero_grad()

                # calculate forward
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # calculate loss
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        # back propagate when training
                        loss.backward()  # calculate gradient

                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=Parameters.CLIP_VALUE)

                        optimizer.step()  # update parameters

                        if iteration % 10 == 0:  # display loss once every ten iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print(
                                'iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'
                                .format(iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    else:
                        epoch_val_loss += loss.item()

        # per-epoch loss bookkeeping for each phase
        t_epoch_finish = time.time()

        # keep latest epoch loss
        if epoch_train_loss != 0.0:
            num_total = len(dataloaders_dict['train'])
            latest_epoch_train_loss = epoch_train_loss / num_total
        if epoch_val_loss != 0.0:
            num_total = len(dataloaders_dict['val'])
            latest_epoch_val_loss = epoch_val_loss / num_total

        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} || Epoch_VAL_Loss:{:.4f}'.
              format(epoch + 1, latest_epoch_train_loss,
                     latest_epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        statistics(epoch + 1, latest_epoch_train_loss, None,
                   latest_epoch_val_loss, None)

        writer.add_scalar('main/loss', latest_epoch_train_loss, epoch + 1)
        if (epoch + 1) % 10 == 0:
            writer.add_scalar('test/loss', latest_epoch_val_loss, epoch + 1)

            model_path = os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR,
                                      f'ssd300_{epoch + 1}.pth')
            torch.save(net.state_dict(), model_path)

        writer.flush()
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

    torch.save(net.state_dict(),
               os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
    writer.close()
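
As a usage note, here is a minimal sketch of reloading the weights that handler() writes at the end of training. It assumes the same SSD class, ssd_cfg dict, and Parameters object are importable; the CPU map_location is an assumption, not part of the original example.

import os
import torch

# rebuild the network with the same configuration used for training
net = SSD(phase="train", cfg=ssd_cfg)
state_dict = torch.load(
    os.path.join(Parameters.ABEJA_TRAINING_RESULT_DIR, 'model.pth'),
    map_location='cpu')  # load onto CPU; move to GPU afterwards if needed
net.load_state_dict(state_dict)
net.eval()  # switch the network to evaluation mode before running inference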
Example #2
def train(train_config):
    logger = Logger(HOME+'/log', train_config.basenet)
    if train_config.dataset_name == 'VOC':
        cfg = voc_config
        dataset = VOCDataset(DATA_DIR, transform=SSDAugmentation(
            cfg['min_dim'], MEANS))
    elif train_config.dataset_name == 'COCO':
        cfg = coco_config
        dataset = COCODataset(DATA_DIR, transform=SSDAugmentation(
            cfg['min_dim'], MEANS))
    else:
        raise ValueError(
            'Unknown dataset name: {}'.format(train_config.dataset_name))

    if train_config.visdom:
        import visdom
        viz = visdom.Visdom()

    ssd_net = SSD('train', train_config.basenet,
                  cfg['min_dim'], cfg['num_classes'], with_fpn=train_config.with_fpn)
    net = ssd_net
    if train_config.cuda:
        net = nn.DataParallel(ssd_net)
        cudnn.benchmark = True
    if train_config.resume:
        logger('Loading {} ...'.format(train_config.resume))
        load_weights = torch.load(
            train_config.resume, map_location=lambda storage, loc: storage)
        ssd_net.load_state_dict(load_weights)
    if train_config.cuda:
        net = net.cuda()
    if not train_config.resume:
        logger('Initializing weights ...')
        ssd_net.topnet.apply(weights_init)
        ssd_net.loc_layers.apply(weights_init)
        ssd_net.conf_layers.apply(weights_init)

    optimizer = optim.Adam(net.parameters(), lr=train_config.lr,
                           weight_decay=train_config.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, train_config.cuda)

    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    logger('Loading the dataset...')

    epoch_size = len(dataset) // train_config.batch_size
    logger('Training SSD on:{}'.format(dataset.name))
    # logger('using the specified args:')

    step_index = 0

    if train_config.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset, train_config.batch_size,
                                  num_workers=train_config.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)
    # create batch iterator
    batch_iterator = iter(data_loader)
    t0 = time.time()
    for iteration in range(train_config.start_iter, cfg['max_iter']):
        if train_config.visdom and iteration != 0 and (iteration % epoch_size == 0):
            # loc_loss and conf_loss are accumulated as Python floats, so no .item() here
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            logger('epoch = {} : loss = {}, loc_loss = {}, conf_loss = {}'.format(
                epoch, loc_loss + conf_loss, loc_loss, conf_loss))
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, train_config.lr,
                                 train_config.gamma, step_index)

        # load train data
        images, targets = next(batch_iterator)

        if iteration//epoch_size > 0 and iteration % epoch_size == 0:
            batch_iterator = iter(data_loader)
            print(iteration)

        if train_config.cuda:
            images = images.cuda()
            targets = [ann.cuda() for ann in targets]
        # else:
        #     images=torch.tensor(images)
        #     targets=torch.tensor(targets)
        # forward

        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        if train_config.visdom:
            loc_loss += loss_l.item()
            conf_loss += loss_c.item()

        if iteration % 50 == 0:
            t1 = time.time()
            logger('timer: %.4f sec. || iter %d || Loss: %.4f || loc_loss: %.4f || conf_loss: %.4f' %
                   (t1 - t0, iteration, loss.item(), loss_l.item(), loss_c.item()))
            t0 = time.time()

        if train_config.visdom:
            update_vis_plot(iteration, loss_l.item(), loss_c.item(),
                            iter_plot, epoch_plot, 'append')

        if iteration != 0 and iteration % 5000 == 0:
            logger('Saving state, iter:%d' % iteration)
            torch.save(ssd_net.state_dict(), train_config.save_folder +
                       'ssd224_VOC_' + repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(),
               train_config.save_folder + 'ssd224_VOC.pth')
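
The lr_steps schedule above relies on an adjust_learning_rate helper defined elsewhere in the repository. A plausible step-decay implementation (an assumption, shown only to clarify what each call at an lr_step does) is:

def adjust_learning_rate(optimizer, lr, gamma, step_index):
    # decay the base learning rate by gamma once per configured lr step
    new_lr = lr * (gamma ** step_index)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr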
Example #3
def train(train_file, test_file, num_epoch):
    use_gpu = torch.cuda.is_available()
    Loss = MultiBoxLoss_2()  ## loss
    learning_rate = 0.01
    num_epochs = num_epoch
    batch_size = 16
    model = SSD(depth=50, width=1)
    #optimizer = torch.optim.SGD([{"params":model.parameters()}], lr=learning_rate, momentum=0.9, weight_decay=5e-4)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer)

    if use_gpu:
        model.cuda()

    model.train()

    train_dataset = ListDataset(root='GUN/WeaponS/',
                                list_file=train_file,
                                train=True,
                                transform=transforms.ToTensor())
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=2)
    test_dataset = ListDataset(root='GUN/WeaponS/',
                               list_file=test_file,
                               train=False,
                               transform=transforms.ToTensor())
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=2)

    for epoch in range(num_epochs):
        t1 = time.time()
        model.train()

        total_loss, valid_loss = 0, 0

        # learning rate is adjusted by ReduceLROnPlateau via scheduler.step(valid_loss) after validation

        ## train model
        print("Train {} epoch: ".format(epoch + 1))
        for i, (imgs, loc, conf) in enumerate(train_loader):
            imgs, loc, conf = Variable(imgs), Variable(loc), Variable(conf)
            if use_gpu:
                imgs = imgs.cuda()
                loc = loc.cuda()
                conf = conf.cuda()
            loc_pred, con_pred = model(imgs)

            loss = Loss(loc_pred, loc, con_pred, conf)
            total_loss += loss.item()
            #loss = conf_loss + loc_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #print('Training progress %.1f %%' %(100*(i+1)/len(train_loader)), end='')
        #print('loc loss: ', loc_loss_total/len(train_loader))
        #print('conf loss: ', conf_loss_total/len(train_loader))
        print('\rEpoch [%d/%d], Training loss: %.4f' %
              (epoch + 1, num_epochs, total_loss / len(train_loader)),
              end='\n')

        ## test model

        model.eval()
        with torch.no_grad():
            for i, (imgs, loc, conf) in enumerate(test_loader):
                imgs, loc, conf = Variable(imgs), Variable(loc), Variable(conf)
                if use_gpu:
                    imgs = imgs.cuda()
                    loc = loc.cuda()
                    conf = conf.cuda()
                loc_pred, con_pred = model(imgs)
                loss = Loss(loc_pred, loc, con_pred, conf)
                valid_loss += loss.item()

                #print('Validation progress %.1f %%' %(100*(i+1)/len(test_loader)), end='')
            print('\rEpoch [%d/%d], Validation loss: %.4f' %
                  (epoch + 1, num_epochs, valid_loss / len(test_loader)),
                  end='\n')
            print('\n')
        scheduler.step(valid_loss)

        t2 = time.time()
        print('epoch elapsed time: %.2f secs' % (t2 - t1))

        # Save model
        #PATH_1 = 'drive/My Drive/BootCamp4/SSD/ssd_2.pki'
        #torch.save(model, PATH_1)

        PATH = 'drive/My Drive/BootCamp4/SSD/ssd_state_dict.pki'
        torch.save(model.state_dict(), PATH)
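
The scheduler above is constructed with PyTorch's defaults. An equivalent, more explicit configuration (the keyword values below are simply the library defaults, not settings taken from the original code) would be:

from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer,
                              mode='min',    # the monitored quantity (validation loss) should decrease
                              factor=0.1,    # multiply the learning rate by 0.1 on plateau
                              patience=10)   # epochs without improvement before reducing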