示例#1
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    # standard PyTorch mean-std input image normalization
    transform = T.Compose([
        T.Resize(800),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    pimg = Image.open(args.source).convert('RGB')

    t0 = time.time()
    scores, boxes = detect(pimg, model, transform, device)
    print(f'inference use {time.time() - t0} seconds.')
    plot_results(pimg, scores, boxes)
示例#2
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # if args.distributed:
    #     sampler_train = DistributedSampler(dataset_train)
    #     sampler_val = DistributedSampler(dataset_val, shuffle=False)
    # else:
    #     sampler_train = torch.utils.data.RandomSampler(dataset_train)
    #     sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # batch_sampler_train = torch.utils.data.BatchSampler(
    #     sampler_train, args.batch_size, drop_last=True)

    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
    #                                collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn_leimao,
                                 num_workers=args.num_workers)

    for inputs, labels in data_loader_val:
        print("---------------------")
        print(inputs.shape)
        print(labels)
示例#3
0
def load_model(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
    #                               weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # # on local
    # dataset_train_ = build_nvdataset(dataset_root=[
    #                                     os.path.join(os.environ["HOME"],'datasets/annotation_sql_nvidia'),
    #                                     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                                 mode='train')
    dataset_val = build_nvdataset(dataset_root=[
        os.path.join(os.environ["HOME"], 'datasets/test'),
        os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')
    ],
                                  mode='test')
    # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # # on maglev
    # dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql,  args.dataset_root_img],
    #                                  mode='train')
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test')
    # indices_50k =np.load(os.path.join(args.root_indices))

    # dataset_train = Subset(dataset_train_, indices_50k)
    print("Validation samples: %d" % (len(dataset_val)))
    # IPython.embed()

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device


# if __name__ == '__main__':
#     parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
#     args = parser.parse_args()
#     if args.output_dir:
#         Path(args.output_dir).mkdir(parents=True, exist_ok=True)
#     main(args)

# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:train_with_eval -- --eval

# test:
# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:eval -- --eval --resume /home/shuxuang/experiments/train_output/checkpoint.pth

# get info:
# maglev workflows get 0cf25940-3f00-5c2c-a8e8-1571e986513b
# maglev volumes mount -n train-outputs  -v 4977fea-0998-4d5b-b557-ff17605f2098 -p /home/shuxuang/experiments/train_output
示例#4
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    # Save our Wandb metadata
    if not args.no_wb:
        wandb.init(entity='dl-project',
                   project='dl-final-project',
                   name=args.wb_name,
                   notes=args.wb_notes,
                   reinit=True)
    wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of trainable params:', n_parameters)
    wandb.config.n_parameters = n_parameters
    wandb.config.n_trainable_parameters = n_parameters  # better name

    # Log total # of model parameters (including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # For visualization we want the raw images without any normalization or random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    wandb.config.len_dataset_train = len(dataset_train)
    wandb.config.len_dataset_val = len(dataset_val)
    wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]

    # Not sure if we should save all hyperparameters in wandb.config?
    # just start with a few important ones
    wandb.config.lr = args.lr
    wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            # print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:

        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_file_for_wb = str(
                output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
            checkpoint_paths = [
                output_dir / 'checkpoint.pth', checkpoint_file_for_wb
            ]

            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

            # Save model checkpoint to W&B
            wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for fixed(?) set of images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        # Save the COCO metrics properly
        metric_name = [
            "AP", "AP50", "AP75", "APs", "APm", "APl", "AR@1", "AR@10",
            "AR@100", "ARs", "ARm", "ARl"
        ]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if not args.no_wb:
            wandb.log(log_stats)
        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                    eval_path_for_wb = str(output_dir / "eval" /
                                           eval_filename_for_wb)
                    filenames = ['latest.pth', eval_filename_for_wb]
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

                    # TODO not sure if this file will end up being too big
                    # I think it's the COCO precision/recall metrics
                    # in some format...
                    # let's track it just in case to start!
                    wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    if args.output_dir is None:
        args.output_dir = os.path.expanduser(
            '~/Data/AI2Thor_detection_features/')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_all = build_dataset(image_set='all', args=args)

    if args.distributed:
        sampler_all = DistributedSampler(dataset_all, shuffle=False)
    else:
        sampler_all = torch.utils.data.SequentialSampler(dataset_all)

    data_loader_all = DataLoader(dataset_all,
                                 args.batch_size,
                                 sampler=sampler_all,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    base_ds = get_coco_api_from_dataset(dataset_all)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start extracting features")
    start_time = time.time()
    extract_feature(model, criterion, postprocessors, data_loader_all, base_ds,
                    device, args.output_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Extracting features time {}'.format(total_time_str))

    print('Start combining files')
    start_time = time.time()
    data_dir = os.path.expanduser('~/Data/AI2Thor_offline_data_2.0.2/')
    combine_files(args, data_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Combining files time {}'.format(total_time_str))
示例#6
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # Fix the seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)

    # Load from pretrained DETR model.
    assert args.num_queries == 100, args.num_queries
    assert args.enc_layers == 6 and args.dec_layers == 6
    assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone
    if args.backbone == 'resnet50':
        pretrain_model = './data/detr_coco/detr-r50-e632da11.pth'
    elif args.backbone == 'resnet101':
        pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth'
    else:
        pretrain_model = None
    if pretrain_model is not None:
        pretrain_dict = torch.load(pretrain_model, map_location='cpu')['model']
        my_model_dict = model_without_ddp.state_dict()
        pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in my_model_dict}
        my_model_dict.update(pretrain_dict)
        model_without_ddp.load_state_dict(my_model_dict)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 10 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            if (epoch + 1) > args.lr_drop and (epoch + 1) % 10 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#7
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    # IPython.embed()
    # os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # dataset_train_ = build_nvdataset(dataset_root=[
    #                                     os.path.join(os.environ["HOME"],'datasets/annotation_sql_nvidia'),
    #                                     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                                 mode='train')
    # dataset_val = build_nvdataset(dataset_root=[
    #                                 os.path.join(os.environ["HOME"],'datasets/test'),
    #                                 os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                               mode='test')
    # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))

    dataset_train = build_nvdataset(
        dataset_root=[args.dataset_root_sql, args.dataset_root_img],
        mode='train',
        camera=args.camera)
    dataset_val = build_nvdataset(
        dataset_root=[args.dataset_root_test, args.dataset_root_test],
        mode='test',
        camera=args.camera)
    if args.root_indices is not None:
        indices_50k = np.load(os.path.join(args.root_indices))
        # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
        dataset_train = Subset(dataset_train, indices_50k)
    # IPython.embed()
    print("Train samples: %d" % (len(dataset_train)))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # if args.dataset_file == "coco_panoptic":
    #     # We also evaluate AP during panoptic training, on original coco DS
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # elif args.dataset_file == "nvdata":
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # else:
    #     base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     test_stats, coco_evaluator = evaluate_nvdata(model, criterion, postprocessors,
    #                                           data_loader_val, base_ds, device, args.output_dir)
    #     if args.output_dir:
    #         utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
    #     return

    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 50 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # test_stats, coco_evaluator = evaluate_nvdata(
        #     model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
        # )

        # log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
        #              **{f'test_{k}': v for k, v in test_stats.items()},
        #              'epoch': epoch,
        #              'n_parameters': n_parameters}

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            # if coco_evaluator is not None:
            #     (output_dir / 'eval').mkdir(exist_ok=True)
            #     if "bbox" in coco_evaluator.coco_eval:
            #         filenames = ['latest.pth']
            #         if epoch % 50 == 0:
            #             filenames.append(f'{epoch:03}.pth')
            #         for name in filenames:
            #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
            #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#8
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project",
               entity="sangbaeklee",
               group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        #import pdb; pdb.set_trace()
        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],\
                "loss" : train_stats['loss']
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#9
0
文件: main.py 项目: wx-b/VisTR
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # no validation ground truth for ytvos dataset
    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    output_dir = Path(args.output_dir)

    # load coco pretrained weight
    checkpoint = torch.load(args.pretrained_weights,
                            map_location='cpu')['model']
    del checkpoint["vistr.class_embed.weight"]
    del checkpoint["vistr.class_embed.bias"]
    del checkpoint["vistr.query_embed.weight"]
    model.module.load_state_dict(checkpoint, strict=False)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#10
0
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    image_set = 'fewshot' if args.fewshot_finetune else 'train'
    dataset_train = build_dataset(image_set=image_set, args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_support = build_support_dataset(image_set=image_set, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
            sampler_support = samplers.NodeDistributedSampler(dataset_support)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
            sampler_support = samplers.DistributedSampler(dataset_support)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_support = torch.utils.data.RandomSampler(dataset_support)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=False)

    loader_train = DataLoader(dataset_train,
                              batch_sampler=batch_sampler_train,
                              collate_fn=utils.collate_fn,
                              num_workers=args.num_workers,
                              pin_memory=True)

    loader_val = DataLoader(dataset_val,
                            batch_size=args.batch_size,
                            sampler=sampler_val,
                            drop_last=False,
                            collate_fn=utils.collate_fn,
                            num_workers=args.num_workers,
                            pin_memory=True)

    loader_support = DataLoader(dataset_support,
                                batch_size=1,
                                sampler=sampler_support,
                                drop_last=False,
                                num_workers=args.num_workers,
                                pin_memory=False)

    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    if not args.fewshot_finetune:
        param_dicts = [{
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names)
                and not match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr":
            args.lr,
            "initial_lr":
            args.lr,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_backbone_names)
                and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
            "initial_lr":
            args.lr_backbone,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr":
            args.lr * args.lr_linear_proj_mult,
            "initial_lr":
            args.lr * args.lr_linear_proj_mult,
        }]
    else:
        # For few-shot finetune stage, do not train sampling offsets, reference points, and embedding related parameters
        param_dicts = [
            {
                "params":
                    [p for n, p in model_without_ddp.named_parameters()
                     if not match_name_keywords(n, args.lr_backbone_names) and \
                        not match_name_keywords(n, args.lr_linear_proj_names) and \
                        not match_name_keywords(n, args.embedding_related_names) and p.requires_grad],
                "lr": args.lr,
                "initial_lr": args.lr,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
                "lr": args.lr_backbone,
                "initial_lr": args.lr_backbone,
            },
        ]

    optimizer = torch.optim.AdamW(param_dicts, weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     args.lr_drop_milestones,
                                     gamma=0.1,
                                     warmup_epochs=args.warmup_epochs,
                                     warmup_factor=args.warmup_factor,
                                     warmup_method='linear',
                                     last_epoch=args.start_epoch - 1)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.dataset.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if args.fewshot_finetune:
            if args.category_codes_cls_loss:
                # Re-init weights of novel categories for few-shot finetune
                novel_class_ids = datasets.get_class_ids(args.dataset_file,
                                                         type='novel')
                if args.num_feature_levels == 1:
                    for novel_class_id in novel_class_ids:
                        nn.init.normal_(model_without_ddp.category_codes_cls.L.
                                        weight[novel_class_id])
                elif args.num_feature_levels > 1:
                    for classifier in model_without_ddp.category_codes_cls:
                        for novel_class_id in novel_class_ids:
                            nn.init.normal_(
                                classifier.L.weight[novel_class_id])
                else:
                    raise RuntimeError

    if args.eval:
        # Evaluate only base categories
        test_stats, coco_evaluator = evaluate(args,
                                              model,
                                              criterion,
                                              postprocessors,
                                              loader_val,
                                              loader_support,
                                              base_ds,
                                              device,
                                              type='base')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_base.pth")

        # Evaluate only novel categories
        test_stats, coco_evaluator = evaluate(args,
                                              model,
                                              criterion,
                                              postprocessors,
                                              loader_val,
                                              loader_support,
                                              base_ds,
                                              device,
                                              type='novel')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_novel.pth")

        return

    print("Start training...")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        # Saving Checkpoints after each epoch
        if args.output_dir and (not args.fewshot_finetune):
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Saving Checkpoints every args.save_every_epoch epoch(s)
        if args.output_dir:
            checkpoint_paths = []
            if (epoch + 1) % args.save_every_epoch == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Evaluation and Logging
        if (epoch + 1) % args.eval_every_epoch == 0:
            if 'base' in args.dataset_file:
                evaltype = 'base'
            else:
                evaltype = 'all'
            if args.fewshot_finetune:
                evaltype = 'novel'

            test_stats, coco_evaluator = evaluate(args,
                                                  model,
                                                  criterion,
                                                  postprocessors,
                                                  loader_val,
                                                  loader_support,
                                                  base_ds,
                                                  device,
                                                  type=evaltype)

            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                **{f'test_{k}': v
                   for k, v in test_stats.items()}, 'epoch': epoch,
                'n_parameters': n_parameters,
                'evaltype': evaltype
            }

            if args.output_dir and utils.is_main_process():
                with (output_dir / "results.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")
                # for evaluation logs
                if coco_evaluator is not None:
                    (output_dir / 'eval').mkdir(exist_ok=True)
                    if "bbox" in coco_evaluator.coco_eval:
                        filenames = ['latest.pth']
                        filenames.append(f'{epoch:03}.pth')
                        for name in filenames:
                            torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                       output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#11
0
文件: main.py 项目: mlpc-ucsd/LETR
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    output_dir = Path(args.output_dir)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.eval:
        dataset_val = build_dataset(image_set=args.dataset, args=args)

        if args.distributed:
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)

        data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)
    else:
        dataset_train = build_dataset(image_set='train', args=args)
        dataset_val = build_dataset(image_set='val', args=args)

        if args.distributed:
            sampler_train = DistributedSampler(dataset_train)
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = torch.utils.data.SequentialSampler(dataset_train)
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)

        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

        data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers)
        data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)

    base_ds = get_coco_api_from_dataset(dataset_val)



    if args.resume and args.frozen_weights:
        assert False
    elif args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True)
            new_state_dict = {}
            for k in checkpoint['model']:
                if ("class_embed" in k) or ("bbox_embed" in k) or ("query_embed" in k):
                    continue
                if  ("input_proj" in k) and args.layer1_num != 3:
                    continue
                new_state_dict[k] = checkpoint['model'][k]
            
            # Compare load model and current model
            current_param = [n for n,p in model_without_ddp.named_parameters()]
            current_buffer = [n for n,p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            for p in load_param:
                if p not in current_param and p not in current_buffer:
                    print(p, 'NOT appear in current model.  ')
            for p in current_param:
                if p not in load_param:
                    print(p, 'NEW parameter.  ')
            model_without_ddp.load_state_dict(new_state_dict, strict=False)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

            # this is to compromise old implementation 
            new_state_dict = {}
            for k in checkpoint['model']:
                if "bbox_embed" in k:
                    print("bbox_embed from OLD implementation has been replaced with lines_embed")
                    new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] 
                else:
                    new_state_dict[k] = checkpoint['model'][k]

            # compare resume model and current model
            current_param = [n for n,p in model_without_ddp.named_parameters()]
            current_buffer = [n for n,p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            #for p in load_param:
                #if p not in current_param and p not in current_buffer:
                    #print(p, 'not been loaded to current model. Strict == False?')
            for p in current_param:
                if p not in load_param:
                    print(p, 'is a new parameter. Not found from load dict.')
            
            # load model
            model_without_ddp.load_state_dict(new_state_dict)

            # load optimizer
            if not args.no_opt and not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
                checkpoint['lr_scheduler']['step_size'] = args.lr_drop  # change the lr_drop epoch
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                args.start_epoch = checkpoint['epoch'] + 1
    elif args.frozen_weights:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        new_state_dict = {}
        for k in checkpoint['model']:
            if "bbox_embed" in k:
                new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] 
            else:
                new_state_dict[k] = checkpoint['model'][k]

        model_without_ddp.letr.load_state_dict(new_state_dict)

        # params
        encoder = {k:v for k,v in new_state_dict.items() if "encoder" in k}
        decoder = {k:v for k,v in new_state_dict.items() if "decoder" in k}
        class_embed = {k:v for k,v in new_state_dict.items() if "class_embed" in k}
        line_embed = {k:v for k,v in new_state_dict.items() if "lines_embed" in k}

        model_without_ddp.load_state_dict(encoder, strict=False)
        model_without_ddp.load_state_dict(decoder, strict=False)
        model_without_ddp.load_state_dict(class_embed, strict=False)
        model_without_ddp.load_state_dict(line_embed, strict=False)
        print('Finish load frozen_weights')
    else:
        print("NO RESUME. TRAIN FROM SCRATCH")
        
    if args.eval:
        test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args)
        #print('checkpoint'+ str(checkpoint['epoch']))
        return 

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        
        train_stats = train_one_epoch(model, criterion, postprocessors, data_loader_train, optimizer, device, epoch, args.clip_max_norm, args)
        
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoints/checkpoint.pth']
            # extra checkpoint before LR drop and every several epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir / f'checkpoints/checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args)

        log_stats = {**{f'train_{k}': format(v, ".6f") for k, v in train_stats.items()},
                     **{f'test_{k}': format(v, ".6f") for k, v in test_stats.items()},
                     'epoch': epoch, 'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#12
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    checkpoint = torch.load(args.resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'], strict=False)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    for img_file in os.listdir(args.imgs_dir):
        t0 = time.time()
        img_path = os.path.join(args.imgs_dir, img_file)
        out_imgName = './visualize/' + 'out_' + img_file[:-4] + '.png'
        im = Image.open(img_path)
        # mean-std normalize the input image (batch-size: 1)
        img = transform(im).unsqueeze(0)

        img = img.cuda()
        # propagate through the model
        outputs = model(img)

        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        prob = out_logits.sigmoid()
        topk_values, topk_indexes = torch.topk(prob.view(
            out_logits.shape[0], -1),
                                               100,
                                               dim=1)
        scores = topk_values
        topk_boxes = topk_indexes // out_logits.shape[2]
        labels = topk_indexes % out_logits.shape[2]
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(boxes, 1,
                             topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        keep = scores[0] > 0.2
        boxes = boxes[0, keep]
        labels = labels[0, keep]

        # and from relative [0, 1] to absolute [0, height] coordinates
        im_h, im_w = im.size
        #print('im_h,im_w',im_h,im_w)
        target_sizes = torch.tensor([[im_w, im_h]])
        target_sizes = target_sizes.cuda()
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        print(time.time() - t0)
        #plot_results
        source_img = Image.open(img_path).convert("RGBA")

        draw = ImageDraw.Draw(source_img)

        print("Boxes", boxes, boxes.tolist())
        for xmin, ymin, xmax, ymax in boxes[0].tolist():
            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline="red")

        source_img.save(out_imgName, "png")
示例#13
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    checkpoint = torch.load(args.resume, map_location='cpu')
    model.load_state_dict(checkpoint['model'], strict=False)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    t0 = time.time()
    im = Image.open(args.img_path)
    print(im)
    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)
    #print('img shape = ', img[0].shape)

    img=img.cuda()
    #print(img[0].cpu().numpy())

    import cv2 

    # cv2.imwrite('/content/content/content/Deformable-DETR/explained /sample before forward/tensor0.png', img[0][0].cpu().numpy(), )
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor1.png', img[0][1].cpu().numpy(),)
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor2.png', img[0][2].cpu().numpy(),)
    # cv2.imwrite( '/content/content/content/Deformable-DETR/explained /sample before forward/tensor3.png', img[0].cpu().numpy(),)



    # propagate through the model
    outputs = model(img)

    print ('-----------------------------------------------------------------------------')





    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)

    img=img.cuda()

    # propagate through the model
    outputs = model(img)

    # keep only predictions with 0.7+ confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.6

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = V.rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)

    V.plot_results(im, probas[keep], bboxes_scaled, '/content/Explain-Deformable-DETR/explained /output/1.png')

    print ('keep.nonzero()', keep)
    # use lists to store the outputs via up-values
    conv_features, enc_attn_weights, dec_attn_weights = [], [], []

    hooks = [
        model.backbone[-2].register_forward_hook(
            lambda self, input, output: conv_features.append(output)
        ),
        model.transformer.encoder.layers[-1].self_attn.register_forward_hook(
            lambda self, input, output: enc_attn_weights.append(output[0])
        ),
        model.transformer.decoder.layers[-1].self_attn.register_forward_hook(
            lambda self, input, output: dec_attn_weights.append(output[0])
        ),
    ]

    # propagate through the model
    outputs = model(img)

    for hook in hooks:
        hook.remove()

    # don't need the list anymore
    conv_features = conv_features[0]
    enc_attn_weights = enc_attn_weights[0]
    dec_attn_weights = dec_attn_weights[0]

    h, w = conv_features['0'].tensors.shape[-2:]

    print ('(dec_attn_weights[0, idx].view(h, w))', (dec_attn_weights.transpose(0, 1).shape))

    dec_attn_weights = dec_attn_weights.transpose(0, 1).cpu()
    print ('bboxes_scaled', bboxes_scaled.shape)


    fig, axs = plt.subplots(ncols=len(bboxes_scaled), nrows=2, figsize=(22, 7))
    colors = V.COLORS * 100
    for idx, ax_i, (xmin, ymin, xmax, ymax) in zip(keep.nonzero(), axs.T, bboxes_scaled):
        ax = ax_i[0]
        ax.imshow(dec_attn_weights[0, idx].view(16, 16))
        ax.axis('off')
        ax.set_title(f'query id: {idx.item()}')
        ax = ax_i[1]
        ax.imshow(im)
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                  fill=False, color='blue', linewidth=3))
        ax.axis('off')
        ax.set_title(V.CLASSES[probas[idx].argmax()])
    fig.tight_layout()
    

    fig.savefig('/content/Explain-Deformable-DETR/explained /encoder feature map/1.png')


    # print ('model.transformer.encoder.layers[-1].self_attn', model.transformer.encoder.layers[-1].self_attn)
    # print ('enc_attn_weights.shape', enc_attn_weights.shape)



    # model.transformer.encoder.layers[-1].self_attn.register_forward_hook(printnorm)

    # outputs = model(img)




    print ('-----------------------------------------------------------------------------')

    out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

    prob = out_logits.sigmoid()
    topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
    scores = topk_values
    topk_boxes = topk_indexes // out_logits.shape[2]
    labels = topk_indexes % out_logits.shape[2]
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4))

    keep = scores[0] > 0.35
    boxes = boxes[0, keep]
    labels = labels[0, keep]

    # and from relative [0, 1] to absolute [0, height] coordinates
    im_h,im_w = im.size
    #print('im_h,im_w',im_h,im_w)
    target_sizes =torch.tensor([[im_w,im_h]])
    target_sizes =target_sizes.cuda()
    img_h, img_w = target_sizes.unbind(1)
    scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
    boxes = boxes * scale_fct[:, None, :]
    print(time.time()-t0)
    #plot_results
    source_img = Image.open(args.img_path).convert("RGBA")

    draw = ImageDraw.Draw(source_img)
    
    print("Boxes",boxes,boxes.tolist())
    for xmin, ymin, xmax, ymax in boxes[0].tolist():
        draw.rectangle(((xmin, ymin), (xmax, ymax)), outline ="red")

    source_img.save('test.png', "png")
    results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]
示例#14
0
    output_dir = "outputs"
    device = "cuda"
    seed = 42
    start_epoch = 0
    eval = False
    num_workers = 2

    # distributed
    distributed = False
    world_size = 1
    dist_url = "env://"


# %%
args = Args()
utils.init_distributed_mode(args)

device = torch.device(args.device)

# fix the seed for reproducibility
seed = args.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# %% PREPARE DATA
INSTRE_PATH = "/datadrive/crr/datasets/instre"
TRAIN_PATH = INSTRE_PATH + "/INSTRE-S-TRAIN"
TEST_PATH = INSTRE_PATH + "/INSTRE-S-TEST"

class_dirs_train = glob.glob(os.path.join(TRAIN_PATH, "*/"))
def main(args):
    utils.init_distributed_mode(args)

    print("args: {}".format(args))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # special process to control whether freeze backbone
    args.model.train_backbone = args.lr_backbone > 0

    model, criterion, postprocessors = build_model(args.model)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=args.lr_gamma)

    dataset_train = build_dataset(
        image_set='train',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)
    dataset_val = build_dataset(image_set='val',
                                args=args.dataset,
                                model_stride=model_without_ddp.backbone.stride)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    benchmark_test_parser = benchmark_test.get_args_parser()
    benchmark_test_args = benchmark_test_parser.get_defaults()
    benchmark_test_args.tracker.model = args.model  # overwrite the parameters about network model
    benchmark_test_args.result_path = Path(
        os.path.join(args.output_dir, 'benchmark'))
    benchmark_test_args.dataset_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'benchmark')

    benchmark_eval_parser = benchmark_eval.get_args_parser()
    benchmark_eval_args = benchmark_eval_parser.get_defaults()
    benchmark_eval_args.tracker_path = benchmark_test_args.result_path
    best_eao = 0
    best_ar = [0, 10]  # accuracy & robustness

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        # training
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every args.model_save_step epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.model_save_step == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

            # hack: only inference model
            utils.save_on_master({'model': model_without_ddp.state_dict()},
                                 output_dir / 'checkpoint_only_inference.pth')

        # evalute
        val_stats = evaluate(model, criterion, postprocessors, data_loader_val,
                             device, args.output_dir)

        log_stats = {
            'epoch': epoch,
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'val_{k}': v
               for k, v in val_stats.items()}, 'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # evualute with benchmark
        if utils.is_main_process():
            if (
                    epoch + 1
            ) % args.benchmark_test_step == 0 and epoch > args.benchmark_start_epoch:

                tracker = build_tracker(benchmark_test_args.tracker,
                                        model=model_without_ddp,
                                        postprocessors=postprocessors)
                benchmark_test_args.model_name = "epoch" + str(epoch)
                benchmark_start_time = time.time()
                benchmark_test.main(benchmark_test_args, tracker)
                benchmark_time = time.time() - benchmark_start_time

                benchmark_eval_args.model_name = "epoch" + str(epoch)
                benchmark_eval_args.tracker_prefix = "epoch" + str(epoch)
                eval_results = benchmark_eval.main(benchmark_eval_args)
                eval_result = list(eval_results.values())[0]

                if benchmark_test_args.dataset in ['VOT2018', 'VOT2019']:
                    if args.output_dir:
                        with (output_dir /
                              str("benchmark_" + benchmark_test_args.dataset +
                                  ".txt")).open("a") as f:
                            f.write("epoch: " + str(epoch) + ", best EAO: " +
                                    str(best_eao) + ", " +
                                    json.dumps(eval_result) + "\n")

                    if best_eao < eval_result['EAO']:

                        best_eao = eval_result['EAO']

                        if args.output_dir:
                            best_eao_int = int(best_eao * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_eao_{best_eao_int:03}_only_inference.pth'
                            )

                    if best_ar[0] < eval_result['accuracy'] and best_ar[
                            1] > eval_result['robustness']:

                        best_ar[0] = eval_result['accuracy']
                        best_ar[1] = eval_result['robustness']

                        if args.output_dir:
                            best_accuracy_int = int(best_ar[0] * 1000)
                            best_robustness_int = int(best_ar[1] * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_ar_{best_accuracy_int:03}_{best_robustness_int:03}_only_inference.pth'
                            )

                print("benchmark time: {}".format(benchmark_time))

        if args.distributed:
            torch.distributed.barrier()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#16
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]

    dataset_val = build_dataset(image_set='val', args=args)

    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    base_ds = get_coco_api_from_dataset(dataset_val)

    checkpoint = torch.load(args.resume)
    model_without_ddp.load_state_dict(checkpoint['model'])
    model_without_ddp.eval()
    model_without_ddp.to(device)
    header = 'Test:'
    for samples, targets in data_loader_val:
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model_without_ddp(samples)
        probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
        keep = probas.max(-1).values > 0.7
        bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep])
        print(probas[keep], bboxes_scaled)
示例#17
0
文件: main.py 项目: bjuncek/detr
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print("number of params:", n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set="train", args=args)
    dataset_val = build_dataset(image_set="val", args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size if args.batch_size < 4 else 4,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file in ["cmdd", "cmdc", "wider"]:
        base_ds = None
    elif args.dataset_file == "MOT17":
        base_ds = dataset_val
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location="cpu")
        model_without_ddp.detr.load_state_dict(checkpoint["model"])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location="cpu")

        # NOTE: this is Bruno's hack to load stuff in
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = checkpoint["model"]
        # hack for adding query stuff
        if ("query_embed.query_embed.weight" in model_dict.keys()
                and "query_embed.weight" in pretrained_dict.keys()):
            pretrained_dict[
                "query_embed.query_embed.weight"] = pretrained_dict[
                    "query_embed.weight"]
        # 1. filter out unnecessary keys
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        # if finetuning skip the linear stuff
        if args.finetune:
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if k not in ["class_embed.weight", "class_embed.bias"]
            }
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load new state dict
        model_without_ddp.load_state_dict(model_dict)

        if (not args.eval and not args.load_model_only
                and "optimizer" in checkpoint and "lr_scheduler" in checkpoint
                and "epoch" in checkpoint):
            optimizer.load_state_dict(checkpoint["optimizer"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            args.start_epoch = checkpoint["epoch"] + 1

    if args.eval:
        if args.test and args.dataset_file == "wider":
            if args.resume:
                s = args.resume.split("/")[:-1]
                output_dir = "/" + os.path.join(*s)
            else:
                output_dir = args.output_dir
            print("SAVING TEST WIDER TO ", output_dir)
            test_wider(
                model,
                criterion,
                postprocessors,
                dataset_val,
                data_loader_val,
                device,
                output_dir,
            )
            return
        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )
        if args.output_dir and coco_evaluator is not None:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            args.clip_max_norm,
        )
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / "checkpoint.pth"]
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f"checkpoint{epoch:04}.pth")
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "args": args,
                    },
                    checkpoint_path,
                )

        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )

        log_stats = {
            **{f"train_{k}": v
               for k, v in train_stats.items()},
            **{f"test_{k}": v
               for k, v in test_stats.items()},
            "epoch": epoch,
            "n_parameters": n_parameters,
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / "eval").mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ["latest.pth"]
                    if epoch % 50 == 0:
                        filenames.append(f"{epoch:03}.pth")
                    for name in filenames:
                        torch.save(
                            coco_evaluator.coco_eval["bbox"].eval,
                            output_dir / "eval" / name,
                        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
示例#18
0
def main(args, exp_cfg):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    # device = torch.device('cuda')
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # model, criterion, postprocessors = build_model(args)
    model = SMPLXNet(exp_cfg)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    # for n, p in model_without_ddp.named_parameters():
    #     print(n)
    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    print('start build dataset')
    datasets = make_all_datasets(exp_cfg, split='train')
    # dataset_train = ConcatDataset(datasets['body'])
    dataset_train = ConcatDataset(datasets['body'] + datasets['hand'] +
                                  datasets['head'])

    print('finish build dataset')

    sample_weight = [
        child_dataset.sample_weight for child_dataset in dataset_train.datasets
    ]
    sample_weight = np.concatenate(sample_weight, axis=0)
    sampler_train = torch.utils.data.sampler.WeightedRandomSampler(
        sample_weight, len(dataset_train))
    # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    if args.distributed:
        sampler_train = samplers.DistributedSampler(sampler_train)
        # sampler_val = samplers.DistributedSampler(sampler_val, shuffle=False)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    collate_fn = functools.partial(collate_batch,
                                   use_shared_memory=args.num_workers > 0,
                                   return_full_imgs=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)

    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    optim_cfg = exp_cfg.get('optim', {})
    optimizer = build_optimizer(model, optim_cfg)
    lr_scheduler = build_scheduler(optimizer, optim_cfg['scheduler'])

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.pretrain:
        checkpoint = torch.load(args.pretrain, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    output_dir = Path(args.output_dir)
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, data_loader_train, optimizer,
                                      device, epoch)
        # print('DEBUG!!!!!!!!!'); train_stats = {}
        lr_scheduler.step()

        if args.output_dir:
            if not os.path.exists(args.output_dir) and utils.is_main_process():
                os.makedirs(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 1 epochs
            if (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#19
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
    if args.det_val:
        assert args.eval, 'only support eval mode of detector for track'
        model, criterion, postprocessors = build_model(args)
    elif args.eval:
        model, criterion, postprocessors = build_tracktest_model(args)
    else:
        model, criterion, postprocessors = build_tracktrain_model(args)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set=args.track_train_split, args=args)
    dataset_val = build_dataset(image_set=args.track_eval_split, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
#         if not args.eval:
#             test_stats, coco_evaluator, _ = evaluate(
#                 model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
#             )

    if args.eval:
        assert args.batch_size == 1, print("Now only support 1.")
        tracker = Tracker(score_thresh=args.track_thresh)
        test_stats, coco_evaluator, res_tracks = evaluate(model,
                                                          criterion,
                                                          postprocessors,
                                                          data_loader_val,
                                                          base_ds,
                                                          device,
                                                          args.output_dir,
                                                          tracker=tracker,
                                                          phase='eval',
                                                          det_val=args.det_val,
                                                          fp16=args.fp16)
        if args.output_dir:
            #             utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
            if res_tracks is not None:
                print("Creating video index for {}.".format(args.dataset_file))
                video_to_images = defaultdict(list)
                video_names = defaultdict()
                for _, info in dataset_val.coco.imgs.items():
                    video_to_images[info["video_id"]].append({
                        "image_id":
                        info["id"],
                        "frame_id":
                        info["frame_id"]
                    })
                    video_name = info["file_name"].split("/")[0]
                    if video_name not in video_names:
                        video_names[info["video_id"]] = video_name
                assert len(video_to_images) == len(video_names)
                # save mot results.
                save_track(res_tracks, args.output_dir, video_to_images,
                           video_names, args.track_eval_split)

        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model,
                                      criterion,
                                      data_loader_train,
                                      optimizer,
                                      device,
                                      scaler,
                                      epoch,
                                      args.clip_max_norm,
                                      fp16=args.fp16)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if epoch % 10 == 0 or epoch > args.epochs - 5:
            test_stats, coco_evaluator, _ = evaluate(model,
                                                     criterion,
                                                     postprocessors,
                                                     data_loader_val,
                                                     base_ds,
                                                     device,
                                                     args.output_dir,
                                                     fp16=args.fp16)
            log_test_stats = {
                **{f'test_{k}': v
                   for k, v in test_stats.items()}
            }
            log_stats.update(log_test_stats)

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs


#             if coco_evaluator is not None:
#                 (output_dir / 'eval').mkdir(exist_ok=True)
#                 if "bbox" in coco_evaluator.coco_eval:
#                     filenames = ['latest.pth']
#                     if epoch % 50 == 0:
#                         filenames.append(f'{epoch:03}.pth')
#                     for name in filenames:
#                         torch.save(coco_evaluator.coco_eval["bbox"].eval,
#                                    output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#20
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    # align with DETR format
    args.dataset_file = 'ImageNet'
    args.masks = None
    # freeze cnn weights
    args.lr_backbone = 0 if args.fre_cnn else args.lr
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.updetr_collate_fn,
                                   num_workers=args.num_workers)

    print(len(data_loader_train) * args.epochs)

    output_dir = Path(args.output_dir)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            if lr_scheduler.step_size != args.lr_drop:
                lr_scheduler.step_size = args.lr_drop
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 20 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 20 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#21
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    Dataset = get_dataset(args.dataset, args.task)
    f = open(args.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

    transforms = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(scales, max_size=1333),
            T.Compose([
                T.RandomResize([400, 500, 600]),
                T.RandomSizeCrop(384, 600),
                # T.RandomSizeCrop_MOT(384, 600),
                T.RandomResize(scales, max_size=1333),
            ])),
        normalize,
    ])
    dataset_train = Dataset(args,
                            dataset_root,
                            trainset_paths, (1088, 608),
                            augment=True,
                            transforms=transforms)
    args.nID = dataset_train.nID

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            # sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            # sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)

    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    # data_loader_train = torch.utils.data.DataLoader(
    #     dataset_train,
    #     batch_size=args.batch_size,
    #     shuffle=True,
    #     num_workers=args.num_workers,
    #     pin_memory=True,
    #     drop_last=True
    # )

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # 用于将classifer不更新参数
    # for name,p in model_without_ddp.named_parameters():
    #     if name.startswith('classifier'):
    #         p.requires_grad = False

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # optimizer.add_param_group({'params': criterion.parameters()})

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
            model_dict = model_without_ddp.state_dict()  #当前模型参数
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items() if k not in [
                    "class_embed.0.weight", "class_embed.0.bias",
                    "class_embed.1.weight", "class_embed.1.bias",
                    "class_embed.2.weight", "class_embed.2.bias",
                    "class_embed.3.weight", "class_embed.3.bias",
                    "class_embed.4.weight", "class_embed.4.bias",
                    "class_embed.5.weight", "class_embed.5.bias"
                ]
            }
            model_dict.update(pretrained_dict)

        # missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            model_dict, strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:

            args.start_epoch = checkpoint['epoch'] + 1
            # optimizer.load_state_dict(checkpoint['optimizer'])
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     # optimizer.load_state_dict(checkpoint['optimizer'])
        # for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #     pg['lr'] = pg_old['lr']
        #     pg['initial_lr'] = pg_old['initial_lr']
        # # print(optimizer.param_groups)
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        # # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        # args.override_resumed_lr_drop = True
        # if args.override_resumed_lr_drop:
        #     print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #     lr_scheduler.step_size = args.lr_drop
        #     lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        # lr_scheduler.step(lr_scheduler.last_epoch)

    # model.add_module('id')

    # [p for p in model.named_parameters() if not p[1].requires_grad]
    # 用于将classifer不更新参数
    # optimizer = torch.optim.SGD(filter(lambda x: "classifier" not in x[0], model.parameters()), lr=args.lr,
    #                 momentum=0.9, weight_decay=1e-4)
    # model.classifier.training = False
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion,
                                      data_loader_train, optimizer, device,
                                      epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#22
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    # IPython.embed()
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/detection-f'),  #test
            os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')
        ],
        mode='test',
        camera=args.camera)
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test', camera=args.camera)

    print("Validation samples: %d" % (len(dataset_val)))

    IPython.embed()
    # compute how many boxes in the test dataset for each image
    # accumulate_bboxes_numbers(dataset_val)
    # dataset_train_ = build_nvdataset(dataset_root=[
    #                                     os.path.join(os.environ["HOME"],'datasets/annotation_sql_nvidia'),
    #                                     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                                 mode='train', camera=args.camera)

    # # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # # dataset_train = Subset(dataset_train_, indices_50k)
    # print("Train samples: %d"%(len(dataset_train_)))
    # print(len(dataset_val))

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    log_path = args.resume
    log = os.path.join(args.resume, 'log.txt')
    # read_log(log)
    # IPython.embed()
    args.resume = os.path.join(args.resume, 'checkpoint.pth')
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        print('Load model from %d epoch' % checkpoint['epoch'])
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        vis_bboxes(model, dataset_val, postprocessors, device)
        # inference_time(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device
示例#23
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
            sampler_test = samplers.DistributedSampler(dataset_test,
                                                       shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)
    data_loader_test = DataLoader(dataset_test,
                                  args.batch_size,
                                  sampler=sampler_val,
                                  drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        del checkpoint["model"]["transformer.decoder.class_embed.0.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.0.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.1.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.1.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.2.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.2.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.3.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.3.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.4.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.4.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.5.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.5.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.6.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.6.bias"]
        del checkpoint["model"]["class_embed.0.weight"]
        del checkpoint["model"]["class_embed.0.bias"]
        del checkpoint["model"]["class_embed.1.weight"]
        del checkpoint["model"]["class_embed.1.bias"]
        del checkpoint["model"]["class_embed.2.weight"]
        del checkpoint["model"]["class_embed.2.bias"]
        del checkpoint["model"]["class_embed.3.weight"]
        del checkpoint["model"]["class_embed.3.bias"]
        del checkpoint["model"]["class_embed.4.weight"]
        del checkpoint["model"]["class_embed.4.bias"]
        del checkpoint["model"]["class_embed.5.weight"]
        del checkpoint["model"]["class_embed.5.bias"]
        del checkpoint["model"]["class_embed.6.weight"]
        del checkpoint["model"]["class_embed.6.bias"]
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        # if len(missing_keys) > 0:
        #     print('Missing Keys: {}'.format(missing_keys))
        # if len(unexpected_keys) > 0:
        #     print('Unexpected Keys: {}'.format(unexpected_keys))
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     #print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        #     args.override_resumed_lr_drop = True
        #     if args.override_resumed_lr_drop:
        #         print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #         lr_scheduler.step_size = args.lr_drop
        #         lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        #     lr_scheduler.step(lr_scheduler.last_epoch)
        #     args.start_epoch = checkpoint['epoch'] + 1
        # # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return
    if args.test:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#24
0
文件: main.py 项目: bearcatt/detr
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(split='train', args=args)
    dataset_val = build_dataset(split='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file == "coco":
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     if 'coco' in args.dataset_file:
    #         test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
    #                                             data_loader_val, base_ds, device, args.output_dir)
    #         if args.output_dir:
    #             utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
    #     elif 'anet' == args.dataset_file:
    #         evaluate3d(model, postprocessors, data_loader_val, device, epoch=0)
    #     return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if epoch % args.eval_freq == 0:
            if 'coco' in args.dataset_file:
                test_stats, coco_evaluator = evaluate(model, criterion,
                                                      postprocessors,
                                                      data_loader_val, base_ds,
                                                      device, args.output_dir)
            elif 'anet' == args.dataset_file:
                evaluate3d(model, postprocessors, data_loader_val, device,
                           epoch)
示例#25
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
    #                               weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # # on local
    # dataset_train_ = build_nvdataset(dataset_root=[
    #                                     os.path.join(os.environ["HOME"],'datasets/annotation_sql_nvidia'),
    #                                     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                                 mode='train')
    # dataset_val = build_nvdataset(dataset_root=[
    #                                 os.path.join(os.environ["HOME"],'datasets/test'),
    #                                 os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #                               mode='test', camera=args.camera)
    dataset_val = build_nvdataset(dataset_root=[
        os.path.join(os.environ["HOME"], 'datasets/test'),
        os.path.join(os.environ["HOME"], 'datasets/test')
    ],
                                  mode='test',
                                  camera=args.camera)
    # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # # on maglev
    # dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql,  args.dataset_root_img],
    #                                  mode='train')
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test', camera=args.camera)
    # indices_50k =np.load(os.path.join(args.root_indices))

    # dataset_train = Subset(dataset_train_, indices_50k)
    print("Validation samples: %d" % (len(dataset_val)))
    # IPython.embed()

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        print('Load model from %d epoch' % (checkpoint['epoch'] + 1))
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        if args.dataset_file == 'nvdata':
            evaluate(model, dataset_val, postprocessors, device)
        else:
            evaluate_5classes(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device
示例#26
0
def main(args):
    utils.init_distributed_mode(args)

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 batch_size=1,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        io.load_frozen(args, model_without_ddp)

    output_dir = Path(args.output_dir)
    if args.resume:
        io.resume(args, model_without_ddp, optimizer, lr_scheduler)

    elif args.finetune:
        io.finetune(args, model_without_ddp)

    if args.eval:

        if args.output_dir and utils.is_main_process():
            io.init_wandb(args.dataset_file + "-detr-eval",
                          model,
                          args,
                          n_parameters=n_parameters)

        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir)
        if args.output_dir:
            io.save_on_master(evaluator.coco_eval["bbox"].eval,
                              output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()

    if args.output_dir and utils.is_main_process():
        io.init_wandb(args.dataset_file + "-detr",
                      model,
                      args,
                      n_parameters=n_parameters)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                io.save_checkpoint(args, model_without_ddp, optimizer,
                                   lr_scheduler, epoch)

        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir, epoch)

        if utils.is_main_process() and args.output_dir:
            io.log_wandb(train_stats, test_stats)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    # save final model
    if utils.is_main_process() and args.output_dir:
        io.save_on_master(model_without_ddp, output_dir / "model_final.pth")

    print('Training time {}'.format(total_time_str))
示例#27
0
def main(args):
    # args = parser.parse_args()
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    if args.seed is not None:
        # random.seed(args.seed)
        # torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

        # fix the seed for reproducibility
        seed = args.seed + utils.get_rank()
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    ##################################
    # Logging setting
    ##################################
    if args.output_dir and utils.is_main_process():
        logging.basicConfig(
            filename=os.path.join(args.output_dir, args.log_name),
            filemode='w',
            format=
            '%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
            level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Save to logging
    ##################################
    if utils.is_main_process():
        logging.info(str(args))

    ##################################
    # Initialize dataset
    ##################################

    if not args.evaluate:
        # build_vocab_flag=True, # Takes a long time to build a vocab
        train_dataset = GQATorchDataset(split='train_unbiased',
                                        build_vocab_flag=False,
                                        load_vocab_flag=False)

        if args.distributed:
            sampler_train = torch.utils.data.DistributedSampler(train_dataset)
        else:
            sampler_train = torch.utils.data.RandomSampler(train_dataset)

        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                            args.batch_size,
                                                            drop_last=True)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=batch_sampler_train,
            collate_fn=GQATorchDataset_collate_fn,
            num_workers=args.workers)

        # Old version
        # train_loader = torch.utils.data.DataLoader(
        #     train_dataset, batch_size=args.batch_size, shuffle=True,
        #     collate_fn=GQATorchDataset_collate_fn,
        #     num_workers=args.workers, pin_memory=True)

    val_dataset_list = []
    for eval_split in args.evaluate_sets:
        val_dataset_list.append(
            GQATorchDataset(split=eval_split,
                            build_vocab_flag=False,
                            load_vocab_flag=args.evaluate))
    val_dataset = torch.utils.data.ConcatDataset(val_dataset_list)

    if args.distributed:
        sampler_val = torch.utils.data.DistributedSampler(val_dataset,
                                                          shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(val_dataset)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=GQATorchDataset_collate_fn,
        num_workers=args.workers)

    # Old version
    # val_loader = torch.utils.data.DataLoader(
    #     val_dataset,
    #     batch_size=args.batch_size, shuffle=False,
    #     collate_fn=GQATorchDataset_collate_fn,
    #     num_workers=args.workers, pin_memory=True)

    ##################################
    # Initialize model
    # - note: must init dataset first. Since we will use the vocab from the dataset
    ##################################
    model = PipelineModel()

    ##################################
    # Deploy model on GPU
    ##################################
    model = model.to(device=cuda)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    ##################################
    # define optimizer (and scheduler)
    ##################################

    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,  #  weight_decay=args.weight_decay
        amsgrad=False,
    )
    # optimizer = torch.optim.AdamW(
    #     params=model.parameters(),
    #     lr=args.lr,
    #     weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model_without_ddp.load_state_dict(checkpoint['model'])
            if not args.evaluate:
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'lr_scheduler' in checkpoint:
                    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch'] + 1

            # checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # print("=> loaded checkpoint '{}' (epoch {})"
            #       .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # cudnn.benchmark = True

    ##################################
    # Define loss functions (criterion)
    ##################################
    # criterion = torch.nn.CrossEntropyLoss().cuda()

    text_pad_idx = GQATorchDataset.TEXT.vocab.stoi[
        GQATorchDataset.TEXT.pad_token]
    criterion = {
        "program":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "full_answer":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "short_answer":
        torch.nn.CrossEntropyLoss().to(device=cuda),
        # "short_answer": torch.nn.BCEWithLogitsLoss().to(device=cuda), # sigmoid
        "execution_bitmap":
        torch.nn.BCELoss().to(device=cuda),
    }

    ##################################
    # If Evaluate Only
    ##################################

    if args.evaluate:
        validate(val_loader, model, criterion, args, DUMP_RESULT=True)
        return

    ##################################
    # Main Training Loop
    ##################################

    # best_acc1 = 0
    for epoch in range(args.start_epoch, args.epochs):

        if args.distributed:
            ##################################
            # In distributed mode, calling the :meth`set_epoch(epoch) <set_epoch>` method
            # at the beginning of each epoch before creating the DataLoader iterator is necessary
            # to make shuffling work properly across multiple epochs.
            # Otherwise, the same ordering will be always used.
            ##################################
            sampler_train.set_epoch(epoch)

        lr_scheduler.step()

        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        if (epoch + 1) % 5 == 0:
            validate(val_loader,
                     model,
                     criterion,
                     args,
                     FAST_VALIDATE_FLAG=False)

        # # remember best acc@1 and save checkpoint
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     # 'arch': args.arch,
        #     'state_dict': model.state_dict(),
        #     # 'best_acc1': best_acc1,
        #     'optimizer' : optimizer.state_dict(),
        # }, is_best)

        if args.output_dir:
            output_dir = pathlib.Path(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
示例#28
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    #cab
    writer = SummaryWriter("runs/" + args.tb_name)

    best_value = 0

    print("Start training, best_value is " + str(best_value))
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        #cab
        for k, v in train_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'train_{k}', v, epoch)

        new_value = 0
        for k, v in test_stats.items():
            if (isinstance(v, float)):
                writer.add_scalar(f'test_{k}', v, epoch)
            if (k == "coco_eval_bbox"):
                new_value = v[0]
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
                    v[0], epoch)
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ]',
                    v[1], epoch)
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ]',
                    v[2], epoch)
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                    v[3], epoch)
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                    v[4], epoch)
                writer.add_scalar(
                    'Bbox Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
                    v[5], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]',
                    v[6], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]',
                    v[7], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
                    v[8], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                    v[9], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                    v[10], epoch)
                writer.add_scalar(
                    'Bbox Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
                    v[11], epoch)

            if (k == "coco_eval_masks"):
                new_value = v[0]
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
                    v[0], epoch)
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ]',
                    v[1], epoch)
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ]',
                    v[2], epoch)
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                    v[3], epoch)
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                    v[4], epoch)
                writer.add_scalar(
                    'Mask Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
                    v[5], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]',
                    v[6], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]',
                    v[7], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
                    v[8], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                    v[9], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                    v[10], epoch)
                writer.add_scalar(
                    'Mask Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
                    v[11], epoch)

        print("Epoch finished, best_value is " + str(best_value))

        save_pth = False
        if best_value < new_value:
            best_value = new_value
            save_pth = True

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')

            if save_pth:
                checkpoint_paths.append(output_dir / f'best.pth')
                bestLog = open(output_dir / 'best_log.txt', 'w+')
                bestLog.write(f'Saved model at epoch {epoch:04}\n')

            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        #/cab

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#29
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=0.9)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    output_dir = output_dir / f"{args.backbone}_{args.transformer_type}"
    if args.output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch}_extra.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
示例#30
0
def main(args):
    utils.init_distributed_mode(args)

    device = torch.device(args.device)
    args.batch_size = 1
    print(args)

    if args.dataset_config is not None:
        # https://stackoverflow.com/a/16878364
        d = vars(args)
        with open(args.dataset_config, "r") as f:
            cfg = json.load(f)
        d.update(cfg)

    # fix the seed for reproducibility
    seed = args.seed + dist.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    if args.resume.startswith("https"):
        checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu", check_hash=True)
    else:
        checkpoint = torch.load(args.resume, map_location="cpu")

    model_args = checkpoint["args"]
    for a in vars(args):
        if a not in vars(model_args):
            vars(model_args)[a] = vars(args)[a]

    model_args.device = args.device
    model, _, _, _, _ = build_model(model_args)
    model.to(device)
    with open(Path(args.lvis_minival_path) / "lvis_v1_minival.json", "r") as f:
        lvis_val = json.load(f)
    id2cat = {c["id"]: c for c in lvis_val["categories"]}
    all_cats = sorted(list(id2cat.keys()))

    # model_without_ddp = model
    # if args.distributed:
    #    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
    #    model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("number of params:", n_parameters)

    dset = build_dataset("lvis", image_set="minival", args=args)
    sampler = DistributedSampler(dset, shuffle=False) if args.distributed else torch.utils.data.SequentialSampler(dset)
    dataloader = DataLoader(
        dset,
        args.batch_size,
        sampler=sampler,
        drop_last=False,
        collate_fn=partial(utils.collate_fn, False),
        num_workers=args.num_workers,
    )

    if args.test:
        evaluator = LvisDumper(fixed_ap=True, out_path=os.path.join(args.output_dir, "lvis_eval"))
    else:
        evaluator = LvisEvaluatorFixedAP(dset.lvis, fixed_ap=True)

    postprocessor = build_postprocessors(args, "lvis")
    model.load_state_dict(checkpoint["model_ema"], strict=False)
    model.eval()
    label_set = torch.as_tensor(list(all_cats))
    print("label_set", len(label_set))
    text_memories = []
    print("encoding text...")
    splits = torch.split(label_set, 32)
    for split in tqdm(splits):
        captions = [f"{clean_name(id2cat[l]['name'])}" for l in split.tolist()]
        tokenized = model.transformer.tokenizer.batch_encode_plus(captions, padding="longest", return_tensors="pt").to(
            device
        )
        encoded_text = model.transformer.text_encoder(**tokenized)

        # Transpose memory because pytorch's attention expects sequence first
        text_memory = encoded_text.last_hidden_state.transpose(0, 1)
        # Invert attention mask that we get from huggingface because its the opposite in pytorch transformer
        text_attention_mask = tokenized.attention_mask.ne(1).bool()

        # Resize the encoder hidden states to be of the same d_model as the decoder
        text_memory_resized = model.transformer.resizer(text_memory)

        text_memories.append((text_attention_mask, text_memory_resized, tokenized))

    for batch_dict in tqdm(dataloader):

        samples = batch_dict["samples"].to(device)
        targets = batch_dict["targets"]
        targets = [
            {
                k: v.to(device) if k not in ["tokens_positive", "tokens", "dataset_name"] else v
                for k, v in t.items()
                if k != "caption"
            }
            for t in targets
        ]

        assert len(targets) == 1
        t = targets[0]

        with torch.no_grad():
            features, orig_pos = model.backbone(samples)
            orig_src, orig_mask = features[-1].decompose()
            res = []
            for i in range(len(text_memories)):
                # captions = [f"{clean_name(id2cat[l]['name'])}" for l in split.tolist()]
                bs = len(splits[i])
                src = orig_src.repeat(bs, 1, 1, 1)
                mask = orig_mask.repeat(bs, 1, 1, 1)
                pos = deepcopy(orig_pos)
                pos[0] = pos[0].repeat(bs, 1, 1, 1)

                memory_cache = model.transformer(
                    model.input_proj(src),
                    mask,
                    model.query_embed.weight,
                    pos[-1],
                    text_memories[i],  # captions,
                    encode_and_save=True,
                    text_memory=None,
                    img_memory=None,
                    text_attention_mask=None,
                )
                out = model(samples, captions, encode_and_save=False, memory_cache=memory_cache)
                orig_target_sizes = torch.stack([t["orig_size"] for _ in range(bs)], dim=0)
                results = postprocessor["bbox"](out, orig_target_sizes)

                assert len(results) == len(splits[i])
                for j in range(len(results)):
                    results[j]["labels"] *= splits[i][j].item()

                for output in results:
                    res.append((t["image_id"].item(), output))
            evaluator.update(res)
    evaluator.synchronize_between_processes()
    evaluator.summarize()