Example #1
def train(train_loader, model, scheduler, optimizer, epoch, args):
    global iteration
    print("{} epoch: \t start training....".format(epoch))
    start = time.time()
    total_loss = []
    model.train()
    model.module.is_training = True
    model.module.freeze_bn()
    optimizer.zero_grad()
    for idx, (images, annotations) in enumerate(train_loader):
        images = images.cuda().float()
        annotations = annotations.cuda()
        classification_loss, regression_loss = model([images, annotations])
        classification_loss = classification_loss.mean()
        regression_loss = regression_loss.mean()
        loss = classification_loss + regression_loss
        if bool(loss == 0):
            print('loss equals zero (0), skipping this batch')
            continue
        loss.backward()
        if (idx + 1) % args.grad_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            optimizer.zero_grad()

        total_loss.append(loss.item())
        if (iteration % 1000 == 0):
            print('{} iteration: training ...'.format(iteration))
            ans = {
                'epoch': epoch,
                'iteration': iteration,
                'cls_loss': classification_loss.item(),
                'reg_loss': regression_loss.item(),
                'mean_loss': np.mean(total_loss)
            }
            for key, value in ans.items():
                print('    {:15s}: {}'.format(str(key), value))

            # My copy
            state = {
                'epoch': epoch,
                'parser': args,
                'state_dict': get_state_dict(model)
            }
            torch.save(
                state,
                os.path.join(args.save_folder, args.dataset, args.network,
                             "checkpoint_{}_{}.pth".format(epoch, iteration)))

        iteration += 1
    scheduler.step(np.mean(total_loss))
    result = {'time': time.time() - start, 'loss': np.mean(total_loss)}
    for key, value in result.items():
        print('    {:15s}: {}'.format(str(key), value))
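
Note: Example #1 (and Example #5 below) call a get_state_dict helper whose definition is not shown here; later examples pass it a checkpoint file or a saved state instead of a model, so the helper is project-specific. A minimal sketch of the variant used in this example, assuming its only job is to unwrap (Distributed)DataParallel so the saved keys carry no 'module.' prefix:

import torch

def get_state_dict(model):
    # Hypothetical helper: if the model is wrapped for multi-GPU training,
    # return the underlying module's weights so the checkpoint keys have no
    # 'module.' prefix; otherwise return the plain state dict.
    if isinstance(model, (torch.nn.DataParallel,
                          torch.nn.parallel.DistributedDataParallel)):
        return model.module.state_dict()
    return model.state_dict()
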
Example #2
 def load_pretrained_model(self, pretrained_model_file=None, skip=[]):
     if pretrained_model_file:
         pretrain_state_dict = get_state_dict(pretrained_model_file)
         state_dict = self.state_dict()
         keys = list(state_dict.keys())
         for key in keys:
             if any(s in key for s in skip):
                 continue
             try:
                 state_dict[key] = pretrain_state_dict[key]
             except KeyError:
                 print("KeyError: {} dosen't lie in pretrain state dict".format(key))
                 continue
     else:
         state_dict = model_zoo.load_url(model_urls[self.name])
     self.load_state_dict(state_dict)
Example #3
 def load_pretrained_model(self, pretrained_model_file=None, skip=[]):
     if pretrained_model_file:
         pretrain_state_dict = get_state_dict(pretrained_model_file)
         state_dict = self.state_dict()
         keys = list(state_dict.keys())
         for key in keys:
             if any(s in key for s in skip):
                 continue
             try:
                 state_dict[key] = pretrain_state_dict[key]
             except KeyError:
                 print("KeyError: {} dosen't lie in pretrain state dict".format(key))
                 continue
     else:
         state_dict = model_zoo.load_url('https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth')
     self.load_state_dict(state_dict)
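
Examples #2 and #3 are identical except for the fallback weight source: #2 looks the URL up in a model_urls table keyed by self.name, while #3 hard-codes the Inception-v3 URL. The skip list drops any parameter whose name contains one of the given substrings, which is handy when the classification head has a different shape than the pretrained one. A hypothetical call, where the class name, file name, and 'fc' substring are invented for illustration:

# Keep the randomly initialized classifier head, reuse everything else.
model = MyInceptionClassifier(num_classes=10)
model.load_pretrained_model('inception_v3_imagenet.pth', skip=['fc'])
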
Example #4
def main(cfg, state, plot=False):

    # Dataloaders
    dataset = LeddartechDataset(cfg, use_test_set=True)
    test_loader = DataLoader(dataset,
                             batch_size=cfg['TRAINING']['BATCH_SIZE'],
                             num_workers=cfg['TRAINING']['NUM_WORKERS'])
    print(f"Dataset size: {len(dataset)}")

    # Model
    in_channels = dataset.check_number_channels()
    model = getattr(models, cfg['NEURAL_NET']['NAME'])(cfg, in_channels)
    print(f"Model size: {model.size_of_net}")
    if cfg['TRAINING']['DEVICE'] == 'cuda' and torch.cuda.device_count() > 1:  # multi-GPU
        model = torch.nn.DataParallel(model)
    model.to(cfg['TRAINING']['DEVICE'])
    print(f"Device set to: {cfg['TRAINING']['DEVICE']}")

    # Load model state
    state_dict = get_state_dict(state, device=cfg['TRAINING']['DEVICE'])
    model.load_state_dict(state_dict)
    model.eval()

    # Evaluator engine
    eval_metrics = {}
    for metric in cfg['TRAINING']['METRICS']:
        eval_metrics[metric] = getattr(metrics, metric)(
            cfg, **cfg['TRAINING']['METRICS'][metric])
    evaluator = create_supervised_evaluator(model,
                                            metrics=eval_metrics,
                                            device=cfg['TRAINING']['DEVICE'])
    pbar2 = tqdm_logger.ProgressBar(persist=True, desc='Testing')
    pbar2.attach(evaluator)

    # Start testing
    evaluator.run(test_loader)
    print('Test results: ', evaluator.state.metrics)

    if plot:
        for metric in cfg['TRAINING']['METRICS']:
            if hasattr(eval_metrics[metric], 'make_plot'):
                eval_metrics[metric].make_plot(evaluator.state.metrics)

    return evaluator.state.metrics
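
In this example get_state_dict receives a saved state and a target device rather than a live model, so here it presumably loads (or unpacks) a checkpoint and maps its tensors to the requested device. A minimal sketch under that assumption; the container key 'state_dict' is a guess:

import torch

def get_state_dict(state, device='cpu'):
    # Hypothetical loader: if given a checkpoint path, load it onto the
    # requested device, then unwrap a {'state_dict': ...} container if present.
    if isinstance(state, str):
        state = torch.load(state, map_location=device)
    if isinstance(state, dict) and 'state_dict' in state:
        state = state['state_dict']
    return state
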
Example #5
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            # args.rank = int(os.environ["RANK"])
            args.rank = 1
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    checkpoint = []
    if (args.resume is not None):
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
        params = checkpoint['parser']
        args.num_class = params.num_class
        args.network = params.network
        args.start_epoch = params.start_epoch + 1
        del params

    model = EfficientDet(num_classes=args.num_class,
                         network=args.network,
                         W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                         D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                         D_class=EFFICIENTDET[args.network]['D_class'],
                         gpu=args.gpu)
    if (args.resume is not None):
        model.load_state_dict(checkpoint['state_dict'])
    del checkpoint
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
            print('Run with DistributedDataParallel with device_ids....')
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
            print('Run with DistributedDataParallel without device_ids....')
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        print('Run with DataParallel ....')
        model = torch.nn.DataParallel(model).cuda()

    # Training dataset
    train_dataset = []
    if (args.dataset == 'VOC'):
        #         train_dataset = VOCDetection(root=args.dataset_root,
        #                                      transform=get_augumentation(phase='train', width=EFFICIENTDET[args.network]['input_size'], height=EFFICIENTDET[args.network]['input_size']))
        train_dataset = VOCDetection(root=args.dataset_root,
                                     transform=transforms.Compose([
                                         Normalizer(),
                                         Augmenter(),
                                         Resizer()
                                     ]))

    elif (args.dataset == 'COCO'):
        train_dataset = CocoDataset(
            root_dir=args.dataset_root,
            set_name='train2017',
            transform=get_augumentation(
                phase='train',
                width=EFFICIENTDET[args.network]['input_size'],
                height=EFFICIENTDET[args.network]['input_size']))


#     train_loader = DataLoader(train_dataset,
#                                   batch_size=args.batch_size,
#                                   num_workers=args.workers,
#                                   shuffle=True,
#                                   collate_fn=detection_collate,
#                                   pin_memory=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.workers,
                              shuffle=True,
                              collate_fn=collater,
                              pin_memory=True)
    # define optimizer and scheduler (classification/regression losses are computed inside the model)
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)
    cudnn.benchmark = True

    for epoch in range(args.start_epoch, args.num_epoch):
        train(train_loader, model, scheduler, optimizer, epoch, args)
        state = {
            'epoch': epoch,
            'parser': args,
            'state_dict': get_state_dict(model)
        }
        torch.save(
            state,
            './weights/checkpoint_{}_{}_{}.pth'.format(args.dataset,
                                                       args.network, epoch))
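
main_worker follows the ImageNet-style distributed training pattern: it is either called once for a single-GPU or DataParallel run, or spawned once per GPU for DistributedDataParallel. A hedged sketch of the launcher side, with argument names mirroring those used above and mp.spawn from the standard torch.multiprocessing API:

import torch
import torch.multiprocessing as mp

def main(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One process per GPU; each process receives its GPU index as the
        # first positional argument of main_worker.
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        main_worker(args.gpu, ngpus_per_node, args)
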