Пример #1
0
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    """Gather per-GPU prediction dicts and merge them on the main process.

    Returns the merged dict on the main process; on every other process the
    gathered data is discarded and None is returned implicitly.
    """
    gathered = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # Fold the list of per-process dicts into one mapping.
    merged = {}
    for partial in gathered:
        merged.update(partial)
    return merged
Пример #2
0
        def save_func(filename=None, save_str=None):
            """Checkpoint the current model state.

            filename: target file name; defaults to 'model_epoch_XX.pth'
                stamped with the enclosing epoch.
            save_str: optional text written to work_dir/best.txt on the main
                process (e.g. a record of the best metric so far).
            """
            state_dict = {
                'model': model_without_ddp.state_dict(),
                # 'discriminator': dis_model_without_ddp.state_dict(),
                'current_epoch': epoch,
            }
            filename = filename if filename else 'model_epoch_{:02d}.pth'.format(
                epoch)
            save_path = os.path.join(work_dir, filename)
            # save_on_master presumably writes only on the main process in
            # distributed runs -- confirm in dist_utils.
            dist_utils.save_on_master(state_dict, save_path)
            if dist_utils.is_main_process() and save_str is not None:
                with open(os.path.join(work_dir, 'best.txt'), 'w') as f:
                    f.write(save_str)

            print('Saved to {}'.format(save_path))
Пример #3
0
        def test_func():
            """Evaluate the model on all test loaders and log the metrics.

            Updates the module-level best_mAP and returns True when a new
            best 'AP' value was observed; otherwise False.
            """
            global best_mAP
            updated = False
            # global_step is read from enclosing/module scope -- presumably
            # maintained by the training loop; confirm at module level.
            metrics = evaluation(model,
                                 test_loaders,
                                 device,
                                 cfg.TEST.EVAL_TYPES,
                                 output_dir=work_dir,
                                 iteration=global_step)
            # NOTE(review): the guard checks losses_writer but writes through
            # metrics_writers; both are created together on the main process,
            # so this works, but confirm it is intentional.
            if dist_utils.is_main_process() and losses_writer:
                for dataset_name, metric in metrics.items():
                    for k, v in metric.items():
                        metrics_writers[dataset_name].add_scalar(
                            'metrics/' + k, v, global_step=global_step)
                        # if k == 'mAP' and v > best_mAP:
                        if k == 'AP' and v > best_mAP:
                            best_mAP = v
                            updated = True
            # Restore training mode after evaluation (evaluation presumably
            # switches the model to eval mode -- see do_evaluation).
            model.train()

            return updated
Пример #4
0
def main(cfg, args):
    """Domain-adaptive training entry point.

    Builds source (TRAINS), target (TARGETS) and test loaders, wraps the
    detector for DDP when requested, optionally resumes a checkpoint, then
    runs the epoch loop with per-epoch checkpointing and a final evaluation.

    cfg: experiment config accessed as cfg.SECTION.KEY (yacs-style --
        assumption, confirm against the config module).
    args: parsed CLI namespace; fields used: distributed, gpu, resume,
        test_only.
    """
    train_loader = build_data_loaders(cfg.DATASETS.TRAINS,
                                      transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                      is_train=True,
                                      distributed=args.distributed,
                                      batch_size=cfg.SOLVER.BATCH_SIZE,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)
    # Second training stream (cfg.DATASETS.TARGETS) handed to train_one_epoch
    # alongside train_loader -- presumably the adaptation target domain.
    target_loader = build_data_loaders(cfg.DATASETS.TARGETS,
                                       transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                       is_train=True,
                                       distributed=args.distributed,
                                       batch_size=cfg.SOLVER.BATCH_SIZE,
                                       num_workers=cfg.DATALOADER.NUM_WORKERS)
    test_loaders = build_data_loaders(cfg.DATASETS.TESTS,
                                      transforms=cfg.INPUT.TRANSFORMS_TEST,
                                      is_train=False,
                                      distributed=args.distributed,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = build_detectors(cfg)
    model.to(device)

    # dis_model = Discriminator(cfg)
    # dis_model.to(device)

    # Keep a handle to the raw module so state_dict save/load bypasses the
    # DDP wrapper's 'module.' prefix.
    model_without_ddp = model
    # dis_model_without_ddp = dis_model
    if args.distributed:
        model = DistributedDataParallel(convert_sync_batchnorm(model),
                                        device_ids=[args.gpu])

        # dis_model = DistributedDataParallel(dis_model, device_ids=[args.gpu])
        model_without_ddp = model.module
        # dis_model_without_ddp = dis_model.module

    # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], cfg.SOLVER.LR, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    optimizer = torch.optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        cfg.SOLVER.LR,
        betas=(0.9, 0.999),
        weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    # dis_optimizer = torch.optim.Adam([p for p in dis_model.parameters() if p.requires_grad], cfg.SOLVER.LR, betas=(0.9, 0.999), weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    schedulers = [
        torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                             cfg.SOLVER.STEPS,
                                             gamma=cfg.SOLVER.GAMMA),
        # torch.optim.lr_scheduler.MultiStepLR(dis_optimizer, cfg.SOLVER.STEPS, gamma=cfg.SOLVER.GAMMA),
    ]

    # -1 means "no checkpoint"; training then starts at epoch 0.
    current_epoch = -1
    if args.resume:
        print('Loading from {} ...'.format(args.resume))
        checkpoint = torch.load(args.resume, map_location='cpu')
        # strict=False tolerates missing/unexpected keys in the checkpoint.
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if 'current_epoch' in checkpoint:
            current_epoch = int(checkpoint['current_epoch'])
        # if 'discriminator' in checkpoint:
        #     dis_model_without_ddp.load_state_dict(checkpoint['discriminator'])

    work_dir = cfg.WORK_DIR
    # Evaluation-only mode: score once and exit without training.
    if args.test_only:
        evaluation(model,
                   test_loaders,
                   device,
                   types=cfg.TEST.EVAL_TYPES,
                   output_dir=work_dir)
        return

    # TensorBoard writers exist only on the main process; other ranks keep
    # losses_writer as None / metrics_writers empty.
    losses_writer = None
    if dist_utils.is_main_process():
        losses_writer = SummaryWriter(os.path.join(work_dir, 'losses'))
        losses_writer.add_text('config',
                               '{}'.format(str(cfg).replace('\n', '  \n')))
        losses_writer.add_text('args', str(args))

    metrics_writers = {}
    if dist_utils.is_main_process():
        test_dataset_names = [
            loader.dataset.dataset_name for loader in test_loaders
        ]
        for dataset_name in test_dataset_names:
            metrics_writers[dataset_name] = SummaryWriter(
                os.path.join(work_dir, 'metrics', dataset_name))

    start_time = time.time()
    epochs = cfg.SOLVER.EPOCHS
    # total_steps is a module-level global set here and presumably consumed
    # elsewhere (e.g. by train_one_epoch) -- confirm at module scope.
    global total_steps
    start_epoch = current_epoch + 1
    total_steps = (epochs - start_epoch) * len(train_loader)
    print("Start training, total epochs: {} ({} - {}), total steps: {}".format(
        epochs - start_epoch, start_epoch, epochs - 1, total_steps))
    for epoch in range(start_epoch, epochs):
        if args.distributed:
            # Reshuffle the distributed samplers deterministically per epoch.
            train_loader.batch_sampler.sampler.set_epoch(epoch)
            target_loader.batch_sampler.sampler.set_epoch(epoch)

        def test_func():
            """Evaluate on all test loaders; True if best 'AP' improved."""
            # best_mAP is a module-level global -- assumed initialized at
            # module scope; confirm.
            global best_mAP
            updated = False
            # global_step is read from module scope -- presumably maintained
            # by train_one_epoch; confirm.
            metrics = evaluation(model,
                                 test_loaders,
                                 device,
                                 cfg.TEST.EVAL_TYPES,
                                 output_dir=work_dir,
                                 iteration=global_step)
            # NOTE(review): guard checks losses_writer but writes through
            # metrics_writers; both are created together on the main process.
            if dist_utils.is_main_process() and losses_writer:
                for dataset_name, metric in metrics.items():
                    for k, v in metric.items():
                        metrics_writers[dataset_name].add_scalar(
                            'metrics/' + k, v, global_step=global_step)
                        # if k == 'mAP' and v > best_mAP:
                        if k == 'AP' and v > best_mAP:
                            best_mAP = v
                            updated = True
            # Restore training mode after evaluation.
            model.train()

            return updated

        def save_func(filename=None, save_str=None):
            """Checkpoint model + epoch; optionally record save_str to best.txt."""
            state_dict = {
                'model': model_without_ddp.state_dict(),
                # 'discriminator': dis_model_without_ddp.state_dict(),
                'current_epoch': epoch,
            }
            filename = filename if filename else 'model_epoch_{:02d}.pth'.format(
                epoch)
            save_path = os.path.join(work_dir, filename)
            dist_utils.save_on_master(state_dict, save_path)
            if dist_utils.is_main_process() and save_str is not None:
                with open(os.path.join(work_dir, 'best.txt'), 'w') as f:
                    f.write(save_str)

            print('Saved to {}'.format(save_path))

        epoch_start = time.time()
        # test_func/save_func are handed to the training loop so it can
        # evaluate/checkpoint mid-epoch; discriminator path is disabled.
        train_one_epoch(model,
                        optimizer,
                        train_loader,
                        target_loader,
                        device,
                        epoch,
                        dis_model=None,
                        dis_optimizer=None,
                        writer=losses_writer,
                        test_func=test_func,
                        save_func=save_func)

        for scheduler in schedulers:
            scheduler.step()

        save_func()

        # Full evaluation is forced only after the final epoch; earlier
        # evaluations are triggered from inside train_one_epoch.
        if epoch == (epochs - 1):
            test_func()

        epoch_cost = time.time() - epoch_start
        left = epochs - epoch - 1
        print('Epoch {} ended, cost {}. Left {} epochs, may cost {}'.format(
            epoch, str(datetime.timedelta(seconds=int(epoch_cost))), left,
            str(datetime.timedelta(seconds=int(left * epoch_cost)))))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Total training time {}'.format(total_time_str))
Пример #5
0
                        type=int,
                        help='number of distributed processes')
    parser.add_argument('--dist-url',
                        default='env://',
                        help='url used to set up distributed training')

    parser.add_argument("opts",
                        help="Modify config options using the command-line",
                        default=None,
                        nargs=argparse.REMAINDER)

    args = parser.parse_args()
    # Layer config sources: file first, then command-line overrides.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    dist_utils.init_distributed_mode(args)

    print(args)
    # Linearly rescale the LR when the world size differs from the 4-GPU
    # baseline the configured LR was tuned for.
    world_size = dist_utils.get_world_size()
    if world_size != 4:
        lr = cfg.SOLVER.LR * (float(world_size) / 4)
        print('Change lr from {} to {}'.format(cfg.SOLVER.LR, lr))
        cfg.merge_from_list(['SOLVER.LR', lr])
    # Freeze only after the last mutation: merging into an already-frozen
    # yacs config is an error, so the original freeze-before-LR-rescale
    # ordering was unsafe.
    cfg.freeze()

    print(cfg)
    os.makedirs(cfg.WORK_DIR, exist_ok=True)
    if dist_utils.is_main_process():
        # Persist the fully-resolved config alongside the run outputs.
        with open(os.path.join(cfg.WORK_DIR, 'config.yaml'), 'w') as fid:
            fid.write(str(cfg))
    main(cfg, args)
Пример #6
0
def main(cfg, args):
    """Single-domain training entry point.

    Builds train/test loaders and the detector, optionally wraps it in DDP,
    optionally resumes weights, then trains for cfg.SOLVER.EPOCHS epochs,
    checkpointing and evaluating after every epoch.

    cfg: experiment config accessed as cfg.SECTION.KEY.
    args: parsed CLI namespace; fields used: distributed, gpu, resume,
        test_only.
    """
    train_loader = build_data_loaders(cfg.DATASETS.TRAINS,
                                      transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                      is_train=True,
                                      distributed=args.distributed,
                                      batch_size=cfg.SOLVER.BATCH_SIZE,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)
    test_loaders = build_data_loaders(cfg.DATASETS.TESTS,
                                      transforms=cfg.INPUT.TRANSFORMS_TEST,
                                      is_train=False,
                                      distributed=args.distributed,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = build_detectors(cfg)
    model.to(device)

    # Keep a handle to the raw module so state_dict save/load bypasses the
    # DDP wrapper's 'module.' prefix.
    model_without_ddp = model
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], cfg.SOLVER.LR, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    optimizer = torch.optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        cfg.SOLVER.LR,
        betas=(0.9, 0.999),
        weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     cfg.SOLVER.STEPS,
                                                     gamma=cfg.SOLVER.GAMMA)

    if args.resume:
        print('Loading from {} ...'.format(args.resume))
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    work_dir = cfg.WORK_DIR
    # Evaluation-only mode: score once and exit without training.
    if args.test_only:
        evaluation(model,
                   test_loaders,
                   device,
                   types=cfg.TEST.EVAL_TYPES,
                   output_dir=work_dir)
        return

    # TensorBoard writers exist only on the main process; other ranks keep
    # losses_writer as None / metrics_writers empty.
    losses_writer = None
    if dist_utils.is_main_process():
        losses_writer = SummaryWriter(os.path.join(work_dir, 'losses'))
        losses_writer.add_text('config',
                               '{}'.format(str(cfg).replace('\n', '  \n')))
        losses_writer.add_text('args', str(args))

    metrics_writers = {}
    if dist_utils.is_main_process():
        test_dataset_names = [
            loader.dataset.dataset_name for loader in test_loaders
        ]
        for dataset_name in test_dataset_names:
            metrics_writers[dataset_name] = SummaryWriter(
                os.path.join(work_dir, 'metrics', dataset_name))

    print("Start training")
    start_time = time.time()
    epochs = cfg.SOLVER.EPOCHS
    for epoch in range(epochs):
        if args.distributed:
            # Reshuffle the distributed sampler deterministically per epoch.
            train_loader.batch_sampler.sampler.set_epoch(epoch)

        epoch_start = time.time()
        train_one_epoch(model,
                        optimizer,
                        train_loader,
                        device,
                        epoch,
                        writer=losses_writer)
        scheduler.step()

        # Checkpoint every epoch (main process only via save_on_master).
        state_dict = {'model': model_without_ddp.state_dict(), 'args': args}
        save_path = os.path.join(work_dir,
                                 'model_epoch_{:02d}.pth'.format(epoch))
        dist_utils.save_on_master(state_dict, save_path)
        print('Saved to {}.'.format(save_path))

        metrics = evaluation(model,
                             test_loaders,
                             device,
                             cfg.TEST.EVAL_TYPES,
                             output_dir=work_dir,
                             iteration=epoch)
        if dist_utils.is_main_process() and losses_writer:
            for dataset_name, metric in metrics.items():
                for k, v in metric.items():
                    # Fix: this function defines no `global_step` (the name
                    # was copied from a sibling training script and would
                    # fail here); metrics are per-epoch (iteration=epoch
                    # above), so log them at the epoch index.
                    metrics_writers[dataset_name].add_scalar(
                        'metrics/' + k, v, global_step=epoch)

        epoch_cost = time.time() - epoch_start
        left = epochs - epoch - 1
        print('Epoch {} ended, cost {}. Left {} epochs, may cost {}'.format(
            epoch, str(datetime.timedelta(seconds=int(epoch_cost))), left,
            str(datetime.timedelta(seconds=int(left * epoch_cost)))))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Total training time {}'.format(total_time_str))
Пример #7
0
def do_evaluation(model, data_loader, device, types, output_dir, iteration=None, viz=False):
    """Run inference over data_loader (one image per batch) and score it.

    types: iterable of evaluation names; 'voc' triggers VOC metrics, and
        COCO metrics run whenever the model emitted masks (see NOTE below).
    iteration: tag forwarded to the evaluators for bookkeeping.
    viz: if True, show per-image prediction vs. ground-truth plots.

    Returns a dict of metric results on the main process, {} on other ranks.
    """
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    dataset = data_loader.dataset
    header = 'Testing {}:'.format(dataset.dataset_name)
    results_dict = {}
    has_mask = False
    for images, img_metas, targets in metric_logger.log_every(data_loader, 10, header):
        # The evaluation loader is expected to yield exactly one image per batch.
        assert len(targets) == 1
        images = images.to(device)

        # Time the forward pass only.
        model_time = time.time()
        det = model(images, img_metas)[0]
        boxes, scores, labels = det['boxes'], det['scores'], det['labels']

        model_time = time.time() - model_time

        img_meta = img_metas[0]
        scale_factor = img_meta['scale_factor']
        img_info = img_meta['img_info']

        if viz:
            # Lazy imports: matplotlib is only needed in interactive viz mode.
            import matplotlib.pyplot as plt
            import matplotlib.patches as patches
            plt.switch_backend('TKAgg')
            image = de_normalize(images[0], img_meta)
            plt.subplot(122)
            plt.imshow(image)
            plt.title('Predict')
            for i, ((x1, y1, x2, y2), label) in enumerate(zip(boxes.tolist(), labels.tolist())):
                # Only draw reasonably confident predictions.
                if scores[i] > 0.65:
                    rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, facecolor='none', edgecolor='g')
                    category_id = dataset.label2cat[label]
                    plt.text(x1, y1, '{}:{:.2f}'.format(dataset.CLASSES[category_id], scores[i]), color='r')
                    plt.gca().add_patch(rect)

            plt.subplot(121)
            plt.imshow(image)
            plt.title('GT')
            for i, ((x1, y1, x2, y2), label) in enumerate(zip(targets[0]['boxes'].tolist(), targets[0]['labels'].tolist())):
                category_id = dataset.label2cat[label]
                rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, facecolor='none', edgecolor='g')
                plt.text(x1, y1, '{}'.format(dataset.CLASSES[category_id]))
                plt.gca().add_patch(rect)
            plt.show()

        # Undo the test-time resize so boxes are in original-image coordinates.
        boxes /= scale_factor
        result = {}

        if 'masks' in det:
            has_mask = True
            (w, h) = img_meta['origin_img_shape']
            masks = paste_masks_in_image(det['masks'], boxes, (h, w))
            rles = []
            for mask in masks.cpu().numpy():
                # Binarize, then RLE-encode in Fortran order as pycocotools expects.
                mask = mask >= 0.5
                mask = mask_util.encode(np.array(mask[0][:, :, None], order='F', dtype='uint8'))[0]
                # "counts" is an array encoded by mask_util as a byte-stream. Python3's
                # json writer which always produces strings cannot serialize a bytestream
                # unless you decode it. Thankfully, utf-8 works out (which is also what
                # the pycocotools/_mask.pyx does).
                mask['counts'] = mask['counts'].decode('utf-8')
                rles.append(mask)
            result['masks'] = rles

        boxes = boxes.tolist()
        labels = labels.tolist()
        # Map contiguous training labels back to dataset category ids.
        labels = [dataset.label2cat[label] for label in labels]
        scores = scores.tolist()

        result['boxes'] = boxes
        result['scores'] = scores
        result['labels'] = labels

        # save_visualization(dataset, img_meta, result, output_dir)

        # Results are keyed by image id for the cross-GPU merge below.
        results_dict.update({
            img_info['id']: result
        })
        metric_logger.update(model_time=model_time)

    if get_world_size() > 1:
        # Ensure every rank finished inference before gathering predictions.
        dist.barrier()

    predictions = _accumulate_predictions_from_multiple_gpus(results_dict)
    if not is_main_process():
        # Non-main ranks contribute predictions but do not score them.
        return {}
    results = {}
    # NOTE(review): COCO evaluation is gated on has_mask rather than on
    # 'coco' in types -- confirm that box-only COCO scoring is intentionally
    # skipped here.
    if has_mask:
        result = coco_evaluation(dataset, predictions, output_dir, iteration=iteration)
        results.update(result)
    if 'voc' in types:
        result = voc_evaluation(dataset, predictions, output_dir, iteration=iteration, use_07_metric=False)
        results.update(result)
    return results