Example #1
def train(hyp):
    epochs = opt.epochs  # 300
    batch_size = opt.batch_size  # 64
    #weights = opt.weights  # initial training weights
    random.seed(42)
    np.random.seed(42)
    torch_utils.init_seeds(42)
    # Configure

    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    #train_path = data_dict['train']
    #test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Create model

    config = get_efficientdet_config('tf_efficientdet_d4')
    # build the network from the configuration above
    load_from_pretrained = True

    if load_from_pretrained:
        model = EfficientDet(config, pretrained_backbone=False)
        # load the pretrained checkpoint
        checkpoint = torch.load(r'./tf_efficientdet_d4-5b370b7a.pth',
                                map_location=device)
        try:
            exclude = ['running_mean', 'running_var']  # state_dict keys to skip when transferring weights
            checkpoint = {
                k: v
                for k, v in checkpoint.items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(checkpoint, strict=False)

            print('Transferred %g/%g items from pretrained checkpoint' %
                  (len(checkpoint), len(model.state_dict())))
        except KeyError as e:
            s = " is not compatible with . This may be due to model differences or %s may be out of date. " \
                "Please delete or update  and try again, or use --weights '' to train from scratch."

            raise KeyError(s) from e

        config.num_classes = 1
        config.image_size = opt.img_size[0]
        model.class_net = HeadNet(config,
                                  num_outputs=config.num_classes,
                                  norm_kwargs=dict(eps=.001, momentum=.01))
    else:  # load from best,last
        config.num_classes = 1
        config.image_size = opt.img_size[0]
        model = EfficientDet(config, pretrained_backbone=False)
        checkpoint = torch.load(r'./weights/last.pt', map_location=device)  #
        model.load_state_dict(checkpoint['model'].model.state_dict())
        print("load from last.pt\n")

    config.loss_type = opt.loss_type
    model = DetBenchTrain(model, config)
    print("effDet config:", config)

    imgsz, imgsz_test = opt.img_size  # train and test image sizes

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg0.append(v)
                #pg2.append(v)  # biases
                #print("bias:",k)
            elif ('.weight' in k or '.edge_weights' in k) and '.bn' not in k:
                pg1.append(v)  # apply weight decay
                #print("weight:",k)
            else:
                pg0.append(v)  # all else
                #print("else:",k)

    optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \
        optim.RMSprop(pg0, lr=hyp['lr0'])
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay

    lf = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.9 + 0.1  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    print('Optimizer groups: %g with weight_decay, %g without (biases, BN)' %
          (len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model

    start_epoch, best_fitness = 0, 1000.0  # best_fitness tracks the lowest validation loss
    if not load_from_pretrained:
        if checkpoint['optimizer_state_dict'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            best_fitness = checkpoint['best_summary_loss']
            print("load best loss:", best_fitness)
        if checkpoint['epoch'] is not None:
            start_epoch = checkpoint['epoch'] + 1
            if epochs < start_epoch:
                print(
                    '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                    % (opt.weights, checkpoint['epoch'], epochs))
                epochs += checkpoint['epoch']  # finetune additional epochs
    del checkpoint

    # Mixed precision training https://github.com/NVIDIA/apex
    model.to(device)
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    scheduler.last_epoch = start_epoch - 1  # do not move

    # Distributed training is not used in this script
    distribution = False

    # Trainloader
    dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=RandomSampler(train_dataset),
        pin_memory=True,  #opt.cache_images,
        drop_last=True,
        num_workers=4,
        collate_fn=collate_fn,
    )

    # Testloader
    testloader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size=batch_size,
        num_workers=3,
        shuffle=False,
        sampler=SequentialSampler(validation_dataset),
        pin_memory=True,  #opt.cache_images,
        collate_fn=collate_fn,
    )

    # Exponential moving average
    ema = torch_utils.ModelEMA(model)
    #print("!!!!!!!!!!!!!!!!!! type model")
    #print(type(model))
    # Start training
    t0 = time.time()
    nb = len(dataloader)  #//4  # number of batches
    n_burn = max(2 * nb,
                 1e3)  # burn-in iterations, max(2 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
    print('Using %g dataloader workers' % dataloader.num_workers)
    print('Starting training for %g epochs...' % epochs)
    #anchor = Anchor_config(config)
    #anchor.anchors.to(device)
    #anchor.anchor_labeler.to(device)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        mloss = torch.zeros(3, device='cpu')  # mean losses
        print(
            ('\n' + '%10s' * 7) %
            ('Epoch', 'gpu_mem', 'box', 'cls', 'total', 'targets', 'img_size'))
        #ss = ('\n' + '%5d' * 7)%(0,0,0,0,0,0,0)

        pbar = tqdm(enumerate(dataloader), ncols=180, total=nb)  # progress bar
        for i, (
                images, targets, image_ids
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            #imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            boxes = [target['boxes'].to(device).float()
                     for target in targets]  # yxyx?
            labels = [
                target['labels'].to(device).float() for target in targets
            ]
            images = torch.stack(images, 0)
            images = images.to(device)  #.float()
            batch_size = images.shape[0]

            # Burn-in

            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # all lrs rise from 0.0 to initial_lr * lf(epoch); the j == 2 branch
                    # (a separate bias group starting at 0.1) is never hit with only two groups
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5),
                                      int(imgsz * 1.0 + gs)) // gs * gs  # size (randrange needs int args)
                sf = sz / max(images.shape[2:])  # scale factor
                if sf != 1:
                    ns = [
                        math.ceil(x * sf / gs) * gs for x in images.shape[2:]
                    ]  # new shape (stretched to gs-multiple)
                    images = F.interpolate(images,
                                           size=ns,
                                           mode='bilinear',
                                           align_corners=False)

            total_loss, cls_loss, box_loss = model(images, boxes, labels)
            total_loss = torch.mean(total_loss)
            cls_loss = torch.mean(cls_loss)
            box_loss = torch.mean(box_loss)
            if not torch.isfinite(total_loss):
                print('WARNING: non-finite loss, ending training ', cls_loss,
                      box_loss)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                total_loss.backward()

            # Optimize
            if ni % accumulate == 0:

                optimizer.step()
                optimizer.zero_grad()

                ema.update(model)

            # Print

            mloss = (mloss * i + torch.tensor(
                [box_loss * 50.0, cls_loss, total_loss]).detach()) / (
                    i + 1)  # update mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() /
                             1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.4g' * 5) % ('%g/%g' % (epoch, epochs - 1),
                                               mem, *mloss, boxes[0].shape[0],
                                               images.shape[-1])
            pbar.set_description(s)

            if ni < 3:
                f = 'train_batch%g.jpg' % ni  # filename
                result = plot_images(images=images, targets=boxes, fname=f)

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # mAP

        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            result = validation(model=ema.ema,
                                val_loader=testloader,
                                config=config,
                                device=device)

            #results, maps, times = test.test(opt.data,
            #                                 batch_size=batch_size,
            #                                 imgsz=imgsz_test,
            #                                 save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
            #                                 model=ema.ema,
            #                                 single_cls=opt.single_cls,
            #                                 dataloader=testloader)

            print("val:", result.avg)

        # Write
        with open(results_file, 'a') as f:
            f.write(
                f'[RESULT]: Train loss: {total_loss:.5f}, Epoch: {epoch}, val summary_loss: {result.avg:.5f}\n'
            )
        #if len(opt.name) and opt.bucket:
        #    os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name))

        # Tensorboard

        # Update best mAP
        fi = result.avg  #fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi < best_fitness:
            best_fitness = fi
            print("best fitness improved\n")
        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            #with open(results_file, 'r') as f:  # create checkpoint

            #ckpt = {'epoch': epoch,
            #        'best_fitness': best_fitness,
            #        'training_results': f.read(),
            #        'model': ema.ema,
            #        'optimizer': None if final_epoch else optimizer.state_dict()}

            ckpt = {
                'model': ema.ema,
                #'model_state_dict': ema.ema.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_summary_loss': best_fitness,
                'epoch': epoch,
            }

            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    # Strip optimizers
    n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
    fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
    for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                      [flast, fbest, fresults]):
        if os.path.exists(f1):
            os.rename(f1, f2)  # rename
            if f2.endswith('.pt'):  # is *.pt
                strip_optimizer(f2)  # strip optimizer
                if opt.bucket:
                    os.system('gsutil cp %s gs://%s/weights' %
                              (f2, opt.bucket))  # upload

    # Finish
    #if not opt.evolve:
    #    plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    if distribution and device.type != 'cpu' and torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
    return results
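
train() relies on several module-level names that are not shown in this snippet (opt, device, results_file, mixed_precision, gs, wdir, last, best, train_dataset, validation_dataset, collate_fn). The setup below is only a minimal sketch of what that context might look like; every flag name and hyperparameter value here is an illustrative assumption, not taken from the original script.

# Assumed, illustrative setup for calling train(); adjust to the real project layout.
import argparse
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
mixed_precision = False      # set True only if NVIDIA Apex is installed
wdir = 'weights/'            # checkpoint directory
last, best = wdir + 'last.pt', wdir + 'best.pt'
results_file = 'results.txt'
gs = 128                     # grid-size multiple used by --multi-scale (assumed value)

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=300)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--data', type=str, default='data/wheat.yaml')
parser.add_argument('--img-size', nargs=2, type=int, default=[512, 512])
parser.add_argument('--single-cls', action='store_true')
parser.add_argument('--adam', action='store_true')
parser.add_argument('--multi-scale', action='store_true')
parser.add_argument('--notest', action='store_true')
parser.add_argument('--nosave', action='store_true')
parser.add_argument('--evolve', action='store_true')
parser.add_argument('--loss-type', type=str, default='focal')
parser.add_argument('--name', type=str, default='')
parser.add_argument('--bucket', type=str, default='')
parser.add_argument('--weights', type=str, default='')
opt = parser.parse_args()

hyp = {'lr0': 1e-4, 'momentum': 0.9, 'weight_decay': 4e-5}  # illustrative values

# train_dataset, validation_dataset and collate_fn must also be defined at module level
# results = train(hyp)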
Example #2
def validate(args):
    # might as well try to validate something
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher
    args.redundant_bias = not args.no_redundant_bias

    # create model
    config = get_efficientdet_config(args.model)
    config.redundant_bias = args.redundant_bias
    model = EfficientDet(config)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = DetBenchEval(model, config)
    bench = bench.cuda()
    if has_amp:
        print('Using AMP mixed precision.')
        bench = amp.initialize(bench, opt_level='O1')
    else:
        print('AMP not installed, running network in FP32.')

    if args.num_gpu > 1:
        bench = torch.nn.DataParallel(bench,
                                      device_ids=list(range(args.num_gpu)))

    if 'test' in args.anno:
        annotation_path = os.path.join(args.data, 'annotations',
                                       f'image_info_{args.anno}.json')
        image_dir = 'test2017'
    else:
        annotation_path = os.path.join(args.data, 'annotations',
                                       f'instances_{args.anno}.json')
        image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir),
                            annotation_path)

    loader = create_loader(dataset,
                           input_size=config.image_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           fill_color=args.fill_color,
                           num_workers=args.workers,
                           pin_mem=args.pin_mem)

    img_ids = []
    results = []
    model.eval()
    batch_time = AverageMeter()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            output = bench(input, target['scale'])
            output = output.cpu()
            sample_ids = target['img_id'].cpu()
            for index, sample in enumerate(output):
                image_id = int(sample_ids[index])
                for det in sample:
                    score = float(det[4])
                    if score < .001:  # stop when below this threshold, scores in descending order
                        break
                    coco_det = dict(image_id=image_id,
                                    bbox=det[0:4].tolist(),
                                    score=score,
                                    category_id=int(det[5]))
                    img_ids.append(image_id)
                    results.append(coco_det)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}]  '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s)  '
                    .format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                    ))

    with open(args.results, 'w') as f:
        json.dump(results, f, indent=4)
    if 'test' not in args.anno:
        coco_results = dataset.coco.loadRes(args.results)
        coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
        coco_eval.params.imgIds = img_ids  # score only ids we've used
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    return results
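
The timing statistics in validate() use an AverageMeter helper that is not defined in the snippet. A minimal sketch, assuming the conventional running-average meter (the same shape as the one used by the batch_time calls above):

class AverageMeter:
    """Tracks the most recent value and the running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0   # most recent value
        self.avg = 0.0   # running average
        self.sum = 0.0   # weighted sum of all values
        self.count = 0   # number of samples seen

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count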
Example #3
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on 1 GPU.')

    torch.manual_seed(args.seed + args.rank)

    # create model
    config = get_efficientdet_config(args.model)
    config.redundant_bias = args.redundant_bias  # redundant conv + BN bias layers (True to match official models)
    model = EfficientDet(config)
    model = DetBenchTrain(model, config)

    # FIXME create model factory, pretrained zoo
    # model = create_model(
    #     args.model,
    #     pretrained=args.pretrained,
    #     num_classes=args.num_classes,
    #     drop_rate=args.drop,
    #     drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
    #     drop_path_rate=args.drop_path,
    #     drop_block_rate=args.drop_block,
    #     global_pool=args.gp,
    #     bn_tf=args.bn_tf,
    #     bn_momentum=args.bn_momentum,
    #     bn_eps=args.bn_eps,
    #     checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel()
                                       for m in model.parameters()])))

    model.cuda()
    optimizer = create_optimizer(args, model)
    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    # optionally resume from a checkpoint
    resume_state = {}
    resume_epoch = None
    if args.resume:
        resume_state, resume_epoch = resume_checkpoint(_unwrap_bench(model),
                                                       args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            amp.load_state_dict(resume_state['amp'])
    del resume_state

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '')
        #resume=args.resume)  # FIXME bit of a mess with bench
        if args.resume:
            load_checkpoint(_unwrap_bench(model_ema),
                            args.resume,
                            use_ema=True)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                        model)
                if args.local_rank == 0:
                    logging.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.'
                    )
            except Exception as e:
                logging.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1'
                )
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info(
                    "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP."
                )
            model = DDP(model,
                        device_ids=[args.local_rank
                                    ])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    train_anno_set = 'train2017'
    train_annotation_path = os.path.join(args.data, 'annotations',
                                         f'instances_{train_anno_set}.json')
    train_image_dir = train_anno_set
    dataset_train = CocoDetection(os.path.join(args.data, train_image_dir),
                                  train_annotation_path)

    # FIXME cutmix/mixup worth investigating?
    # collate_fn = None
    # if args.prefetcher and args.mixup > 0:
    #     collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=config.image_size,
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        #re_prob=args.reprob,  # FIXME add back various augmentations
        #re_mode=args.remode,
        #re_count=args.recount,
        #re_split=args.resplit,
        #color_jitter=args.color_jitter,
        #auto_augment=args.aa,
        interpolation=args.train_interpolation,
        #mean=data_config['mean'],
        #std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        #collate_fn=collate_fn,
        pin_mem=args.pin_mem,
    )

    train_anno_set = 'val2017'
    train_annotation_path = os.path.join(args.data, 'annotations',
                                         f'instances_{train_anno_set}.json')
    train_image_dir = train_anno_set
    dataset_eval = CocoDetection(os.path.join(args.data, train_image_dir),
                                 train_annotation_path)

    loader_eval = create_loader(
        dataset_eval,
        input_size=config.image_size,
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=args.interpolation,
        #mean=data_config['mean'],
        #std=data_config['std'],
        num_workers=args.workers,
        #distributed=args.distributed,
        pin_mem=args.pin_mem,
    )

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join(
            [datetime.now().strftime("%Y%m%d-%H%M%S"), args.model])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp,
                                        model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info(
                        "Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            eval_metrics = validate(model, loader_eval, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast',
                                                         'reduce'):
                    distribute_bn(model_ema, args.world_size,
                                  args.dist_bn == 'reduce')

                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    _unwrap_bench(model),
                    optimizer,
                    args,
                    epoch=epoch,
                    model_ema=_unwrap_bench(model_ema),
                    metric=save_metric,
                    use_amp=use_amp)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
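
_unwrap_bench() is called when resuming and saving checkpoints in main(), but its body is not shown. A plausible sketch, assuming it simply peels off the EMA container, DataParallel/DDP (.module) and the DetBench wrapper (.model) until the bare EfficientDet remains:

from timm.utils import ModelEma

def _unwrap_bench(model):
    # Recursively unwrap containers: ModelEma -> .ema, DP/DDP -> .module,
    # DetBenchTrain/DetBenchEval -> .model, until the raw network is reached.
    if model is None:
        return None
    if isinstance(model, ModelEma):
        return _unwrap_bench(model.ema)
    if hasattr(model, 'module'):
        return _unwrap_bench(model.module)
    if hasattr(model, 'model'):
        return _unwrap_bench(model.model)
    return model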
def do_main():
    device = torch.device(f'cuda:{gpu_number}') if torch.cuda.is_available(
    ) else torch.device('cpu')
    print(device)

    print(len(train_boxes_df))
    print(len(train_images_df))

    # Keep only images that have at least one box
    print('Leave only train images with boxes (all)')
    with_boxes_filter = train_images_df[image_id_column].isin(
        train_boxes_df[image_id_column].unique())

    images_val = train_images_df.loc[(train_images_df[fold_column] == fold)
                                     & with_boxes_filter,
                                     image_id_column].values
    images_train = train_images_df.loc[(train_images_df[fold_column] != fold)
                                       & with_boxes_filter,
                                       image_id_column].values

    print(len(images_train), len(images_val))

    train_dataset = WheatDataset(images_train,
                                 DIR_TRAIN,
                                 train_box_callback,
                                 transforms=get_train_transform(),
                                 is_test=False)
    valid_dataset = WheatDataset(images_val,
                                 DIR_TRAIN,
                                 train_box_callback,
                                 transforms=get_valid_transform(),
                                 is_test=True)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=inf_batch_size,
                                   shuffle=False,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    #config = get_efficientdet_config('tf_efficientdet_d4')
    config = get_efficientdet_config('tf_efficientdet_d5')
    net = EfficientDet(config, pretrained_backbone=False)
    #load_weights(net, '../timm-efficientdet-pytorch/efficientdet_d4-5b370b7a.pth')
    load_weights(net,
                 '../timm-efficientdet-pytorch/efficientdet_d5-ef44aea8.pth')

    config.num_classes = 1
    config.image_size = our_image_size
    net.class_net = HeadNet(config,
                            num_outputs=config.num_classes,
                            norm_kwargs=dict(eps=.001, momentum=.01))

    fold_weights_file = f'{experiment_name}.pth'
    if os.path.exists(fold_weights_file):
        # continue training
        print('Continue training, loading weights: ' + fold_weights_file)
        load_weights(net, fold_weights_file)

    model_train = DetBenchTrain(net, config)
    model_eval = DetBenchEval(net, config)

    manager = ModelManager(model_train, model_eval, device)
    weights_file = f'{experiment_name}.pth'

    manager.run_train(train_data_loader,
                      valid_data_loader,
                      n_epoches=n_epochs,
                      weights_file=weights_file,
                      factor=factor,
                      start_lr=start_lr,
                      min_lr=min_lr,
                      lr_patience=lr_patience,
                      overall_patience=overall_patience,
                      loss_delta=loss_delta)

    # log where checkpoints are saved
    neptune.log_text('save checkpoints as', weights_file[:-4])
    neptune.stop()
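
Both DataLoaders above pass a collate_fn that the snippet does not define. Because each sample is an (image, target, image_id) tuple with a variable number of boxes, the usual choice (and the shape the training loop in Example #1 unpacks) is a collate function that simply transposes the batch; a minimal sketch:

def collate_fn(batch):
    # Turn a list of (image, target, image_id) samples into
    # (images, targets, image_ids) tuples without stacking variable-length targets.
    return tuple(zip(*batch))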
def validate(args):
    # might as well try to validate something
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher

    # create model
    config = get_efficientdet_config(args.model)
    model = EfficientDet(config)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    logging.info('Model %s created, param count: %d' %
                 (args.model, param_count))

    bench = DetBenchEval(model, config)

    bench.model = bench.model.cuda()
    if has_amp:
        bench.model = amp.initialize(bench.model, opt_level='O1')

    if args.num_gpu > 1:
        bench.model = torch.nn.DataParallel(bench.model,
                                            device_ids=list(range(
                                                args.num_gpu)))

    if 'test' in args.anno:
        annotation_path = os.path.join(args.data, 'annotations',
                                       f'image_info_{args.anno}.json')
        image_dir = 'test2017'
    else:
        annotation_path = os.path.join(args.data, 'annotations',
                                       f'instances_{args.anno}.json')
        image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir),
                            annotation_path)

    loader = create_loader(dataset,
                           input_size=config.image_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           num_workers=args.workers)

    img_ids = []
    results = []
    model.eval()
    batch_time = AverageMeter()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            output = bench(input, target['img_id'], target['scale'])
            for batch_out in output:
                for det in batch_out:
                    image_id = int(det[0])
                    score = float(det[5])
                    coco_det = {
                        'image_id': image_id,
                        'bbox': det[1:5].tolist(),
                        'score': score,
                        'category_id': int(det[6]),
                    }
                    img_ids.append(image_id)
                    results.append(coco_det)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}]  '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s)  '
                    .format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                    ))

    with open(args.results, 'w') as f:
        json.dump(results, f, indent=4)
    if 'test' not in args.anno:
        coco_results = dataset.coco.loadRes(args.results)
        coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
        coco_eval.params.imgIds = img_ids  # score only ids we've used
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    return results
Example #6
def do_main():
    neptune.init('ods/wheat')
    # Create experiment with defined parameters
    neptune.create_experiment(name=model_name,
                              params=PARAMS,
                              tags=[experiment_name, experiment_tag],
                              upload_source_files=[os.path.basename(__file__)])

    neptune.append_tags(f'fold_{fold}')
    neptune.append_tags(['grad_accum'])

    device = torch.device(f'cuda:{gpu_number}') if torch.cuda.is_available(
    ) else torch.device('cpu')
    print(device)

    print(len(train_boxes_df))
    print(len(train_images_df))

    # Keep only images that have at least one box
    print('Leave only train images with boxes (validation)')
    with_boxes_filter = train_images_df[image_id_column].isin(
        train_boxes_df[image_id_column].unique())

    # configure models for training and validation
    config = get_efficientdet_config('tf_efficientdet_d5')
    net = EfficientDet(config, pretrained_backbone=False)
    load_weights(net,
                 '../timm-efficientdet-pytorch/efficientdet_d5-ef44aea8.pth')

    config.num_classes = 1
    config.image_size = our_image_size
    net.class_net = HeadNet(config,
                            num_outputs=config.num_classes,
                            norm_kwargs=dict(eps=.001, momentum=.01))
    model_train = DetBenchTrain(net, config)
    model_eval = DetBenchEval(net, config)

    manager = ModelManager(model_train, model_eval, device)

    images_val = train_images_df.loc[(train_images_df[fold_column] == fold)
                                     & with_boxes_filter,
                                     image_id_column].values
    images_train = train_images_df.loc[(train_images_df[fold_column] != fold)
                                       & with_boxes_filter,
                                       image_id_column].values

    print(
        f'\nTrain images:{len(images_train)}, validation images {len(images_val)}'
    )

    # get augs
    #augs_dict = set_augmentations(our_image_size)

    # get datasets
    train_dataset = WheatDataset(
        image_ids=images_train[:160],
        image_dir=DIR_TRAIN,
        boxes_df=train_boxes_df,
        transforms=get_train_transform(our_image_size),
        is_test=False)
    valid_dataset = WheatDataset(
        image_ids=images_val[:160],
        image_dir=DIR_TRAIN,
        boxes_df=train_boxes_df,
        transforms=get_valid_transform(our_image_size),
        is_test=True)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn,
                                   drop_last=True)

    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=inf_batch_size,
                                   shuffle=False,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    weights_file = f'../checkpoints/{model_name}/{experiment_name}.pth'

    #pretrain_weights_file = f'{checkpoints_dir}/{experiment_name}.pth'
    #if os.path.exists(pretrain_weights_file):
    #    print(f'Continue training, loading weights from {pretrain_weights_file}')
    #    load_weights(net, pretrain_weights_file)

    manager.run_train(train_generator=train_data_loader,
                      val_generator=valid_data_loader,
                      n_epoches=n_epochs,
                      weights_file=weights_file,
                      factor=factor,
                      start_lr=start_lr,
                      min_lr=min_lr,
                      lr_patience=lr_patience,
                      overall_patience=overall_patience,
                      loss_delta=loss_delta)

    # log where checkpoints are saved
    neptune.log_text('save checkpoints as', weights_file[:-4])
    neptune.stop()
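
load_weights() is used in several of these examples to initialize the EfficientDet network from a .pth file, but its definition is not included. A minimal sketch, under the assumption that the checkpoint is either a bare state_dict or a dict holding one under 'model_state_dict':

import torch

def load_weights(model, weights_file):
    # Assumed helper: read a checkpoint and load its state_dict into the network.
    checkpoint = torch.load(weights_file, map_location='cpu')
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        checkpoint = checkpoint['model_state_dict']
    model.load_state_dict(checkpoint)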
Example #7
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=inf_batch_size,
                                   shuffle=False,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    #weights_file = 'effdet_model14_fold' + str(fold_) + '.pth'
    weights_file = '../Weights/effdet_fold_1_model16_alex_fold1.pth'
    #weights_file = 'effdet_alex_fold0.pth'

    config = get_efficientdet_config('tf_efficientdet_d5')
    net = EfficientDet(config, pretrained_backbone=False)
    config.num_classes = 1
    config.image_size = our_image_size
    net.class_net = HeadNet(config,
                            num_outputs=config.num_classes,
                            norm_kwargs=dict(eps=.001, momentum=.01))
    load_weights(net, weights_file)
    model = DetBenchEval(net, config)

    manager = ModelManager(model, device)

    true_list, pred_boxes, pred_scores = manager.predict(valid_data_loader)
    prob_thresholds = np.linspace(
        0.35, 0.45, num=10,
        endpoint=False)  # score thresholds for counting a detection as a positive
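
The snippet ends before prob_thresholds is actually used. A plausible continuation would sweep each threshold over the validation predictions and keep the best one; competition_metric() below is a hypothetical scoring helper, and pred_boxes / pred_scores are assumed to be per-image numpy arrays aligned index by index:

    best_thr, best_score = None, -1.0
    for thr in prob_thresholds:
        # Keep only detections whose confidence clears the current threshold.
        filtered = [boxes[scores >= thr]
                    for boxes, scores in zip(pred_boxes, pred_scores)]
        score = competition_metric(true_list, filtered)  # hypothetical metric helper
        if score > best_score:
            best_thr, best_score = thr, score
    print(f'Best threshold: {best_thr:.3f}, metric: {best_score:.4f}')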