示例#1
0
def test_coco_detection():
    """Smoke-test CocoDetectionEvaluator: perfect dummy detections give mAP == 1.

    Builds a tiny CocoDataset from the test fixtures, feeds hand-crafted
    results that exactly match the dummy annotations, and checks that the
    evaluator reports a perfect mAP.
    """
    # Results are keyed img_id -> class_id -> list of [x1, y1, x2, y2, score].
    dummy_results = {
        0: {
            0: [[0, 0, 20, 20, 1]],
            1: [[0, 0, 20, 20, 1]]
        },
        1: {
            0: [[0, 0, 20, 20, 1]]
        },
    }

    cfg = dict(
        name="CocoDataset",
        img_path="./tests/data",
        ann_path="./tests/data/dummy_coco.json",
        input_size=[320, 320],  # [w,h]
        keep_ratio=True,
        pipeline=dict(
            normalize=[[103.53, 116.28, 123.675], [57.375, 57.12, 58.395]]),
    )
    dataset = build_dataset(cfg, "train")

    eval_cfg = dict(name="CocoDetectionEvaluator", save_key="mAP")

    evaluator = build_evaluator(eval_cfg, dataset)
    # Fix: use the context-manager form so the temp directory is removed
    # even when evaluate() raises or the assertion fails (the original
    # TemporaryDirectory object was never cleaned up explicitly).
    with tempfile.TemporaryDirectory() as tmp_dir:
        eval_results = evaluator.evaluate(results=dummy_results,
                                          save_dir=tmp_dir,
                                          rank=-1)
    assert eval_results["mAP"] == 1
示例#2
0
def main(args):
    """Evaluate a trained NanoDet Lightning checkpoint on the val/test split.

    Loads the config, builds the validation dataset and evaluator, restores
    the weights from ``args.model`` and runs ``pl.Trainer.test``.

    Args:
        args: parsed CLI namespace; must carry ``config``, ``task``
            ('val' or 'test') and ``model`` (checkpoint path).
    """
    load_config(cfg, args.config)
    local_rank = -1  # single-process evaluation; rank-filtered helpers still run
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__("%Y%m%d%H%M%S")
    # Every run writes into its own timestamped sub-directory.
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = NanoDetLightningLogger(cfg.save_dir)

    assert args.task in ["val", "test"]
    cfg.update({"test_mode": args.task})

    logger.info("Setting up data...")
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )
    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    # Fix: load to CPU first so a GPU-saved checkpoint can be restored on a
    # CPU-only host; Lightning moves the weights to the chosen device later.
    ckpt = torch.load(args.model, map_location="cpu")
    if "pytorch-lightning_version" not in ckpt:
        warnings.warn(
            "Warning! Old .pth checkpoint is deprecated. "
            "Convert the checkpoint with tools/convert_old_checkpoint.py ")
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt["state_dict"])

    if cfg.device.gpu_ids == -1:
        logger.info("Using CPU training")
        accelerator, devices = "cpu", None
    else:
        accelerator, devices = "gpu", cfg.device.gpu_ids

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,  # sanity loop would run eval before weights matter
        logger=logger,
    )
    logger.info("Starting testing...")
    trainer.test(task, val_dataloader)
示例#3
0
def main(args):
    """Train NanoDet with PyTorch Lightning (legacy DDP entry point).

    Validates the head/class-name config, builds datasets, loaders and the
    evaluator, optionally restores weights, then runs ``trainer.fit``.
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        # Fix: the original message read "...class_names),but got {} and {}"
        # with no spacing — reformatted to match the newer entry point.
        raise ValueError(
            'cfg.model.arch.head.num_classes must equal len(cfg.class_names), '
            'but got {} and {}'.format(cfg.model.arch.head.num_classes,
                                       len(cfg.class_names)))
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        pl.seed_everything(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    evaluator = build_evaluator(cfg, val_dataset)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                   shuffle=True, num_workers=cfg.device.workers_per_gpu,
                                                   pin_memory=True, collate_fn=collate_function, drop_last=True)
    # TODO: batch eval
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False,
                                                 num_workers=cfg.device.workers_per_gpu,
                                                 pin_memory=True, collate_fn=collate_function, drop_last=True)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    if 'load_model' in cfg.schedule:
        ckpt = torch.load(cfg.schedule.load_model)
        if 'pytorch-lightning_version' not in ckpt:
            warnings.warn('Warning! Old .pth checkpoint is deprecated. '
                          'Convert the checkpoint with tools/convert_old_checkpoint.py ')
            ckpt = convert_old_model(ckpt)
        # strict=False: allow partial weight transfer (e.g. backbone-only)
        task.load_state_dict(ckpt['state_dict'], strict=False)

    # Resume from the rolling "model_last.ckpt" only when requested.
    model_resume_path = os.path.join(cfg.save_dir, 'model_last.ckpt') if 'resume' in cfg.schedule else None

    trainer = pl.Trainer(default_root_dir=cfg.save_dir,
                         max_epochs=cfg.schedule.total_epochs,
                         gpus=cfg.device.gpu_ids,
                         check_val_every_n_epoch=cfg.schedule.val_intervals,
                         accelerator='ddp',
                         log_every_n_steps=cfg.log.interval,
                         num_sanity_val_steps=0,
                         resume_from_checkpoint=model_resume_path,
                         callbacks=[ProgressBar(refresh_rate=0)]  # disable tqdm bar
                         )

    trainer.fit(task, train_dataloader, val_dataloader)
示例#4
0
def main(args):
    """Deprecated evaluation entry point (pre-Lightning).

    Builds the model and validation loader, restores weights from
    ``args.model`` through the trainer, runs one inference epoch, then either
    dumps raw detections to JSON (task == 'test') or computes metrics and
    optionally appends them to a text file (task == 'val').
    """
    warnings.warn(
        'Warning! Old testing code is deprecated and will be deleted '
        'in next version. Please use tools/test.py')
    load_config(cfg, args.config)
    local_rank = -1  # single-process run; no distributed rank
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__('%Y%m%d%H%M%S')
    # Every run writes into its own timestamped sub-directory.
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    cfg.freeze()
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    trainer = build_trainer(local_rank, cfg, model, logger)
    # Reuse the trainer's checkpoint-loading path by injecting the CLI model
    # path into the schedule config.
    cfg.schedule.update({'load_model': args.model})
    trainer.load_model(cfg)
    evaluator = build_evaluator(cfg, val_dataset)
    logger.log('Starting testing...')
    with torch.no_grad():  # inference only — no gradients needed
        results, val_loss_dict = trainer.run_epoch(0,
                                                   val_dataloader,
                                                   mode=args.task)
    if args.task == 'test':
        # Raw detections only; no ground truth assumed for the test split.
        res_json = evaluator.results2json(results)
        json_path = os.path.join(cfg.save_dir,
                                 'results{}.json'.format(timestr))
        json.dump(res_json, open(json_path, 'w'))
    elif args.task == 'val':
        eval_results = evaluator.evaluate(results,
                                          cfg.save_dir,
                                          rank=local_rank)
        if args.save_result:
            txt_path = os.path.join(cfg.save_dir,
                                    "eval_results{}.txt".format(timestr))
            with open(txt_path, "a") as f:  # append: keep earlier runs' metrics
                for k, v in eval_results.items():
                    f.write("{}: {}\n".format(k, v))
示例#5
0
def main(args):
    """Deprecated training entry point (pre-Lightning).

    Builds the model, the (optionally distributed) data loaders and the
    evaluator, restores/resumes weights according to ``cfg.schedule``, then
    hands control to the custom trainer loop.
    """
    warnings.warn('Warning! Old training code is deprecated and will be deleted '
                  'in next version. Please use tools/train.py')
    load_config(cfg, args.config)
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)                             # mkdir is wrapped with @rank_filter: only the main process creates save_dir
    logger = Logger(local_rank, cfg.save_dir)
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    if len(cfg.device.gpu_ids) > 1:
        # Distributed (DDP) path: bind this process to a GPU, join the NCCL
        # group, and shard the training set via DistributedSampler (which is
        # why shuffle= is omitted here — the sampler handles it).
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                       num_workers=cfg.device.workers_per_gpu, pin_memory=True,
                                                       collate_fn=collate_function, sampler=train_sampler,
                                                       drop_last=True)
    else:
        # Single-GPU / CPU path: plain shuffled loader.
        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                       shuffle=True, num_workers=cfg.device.workers_per_gpu,
                                                       pin_memory=True, collate_fn=collate_function, drop_last=True)

    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=cfg.device.batchsize_per_gpu,
                                                 shuffle=False, num_workers=cfg.device.workers_per_gpu,
                                                 pin_memory=True, collate_fn=collate_function, drop_last=True)

    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)   # warm-start weights only
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)       # full resume: weights + optimizer + epoch

    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_dataloader, val_dataloader, evaluator)
示例#6
0
文件: train_pl.py 项目: wwdok/nanodet
def main(args):
    """Lightning-based training entry point (train_pl.py).

    Builds datasets, loaders and the evaluator, wraps everything in a
    TrainingTask and drives training through ``pl.Trainer.fit``.
    """
    load_config(cfg, args.config)
    rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(rank, cfg.save_dir)
    logger = Logger(rank, cfg.save_dir)
    # TODO: replace with lightning random seed
    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Setting up data...')
    dataset_train = build_dataset(cfg.data.train, 'train')
    dataset_val = build_dataset(cfg.data.val, 'test')

    loader_train = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    # TODO: batch eval
    loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)

    evaluator = build_evaluator(cfg, dataset_val)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator, logger)

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0)

    trainer.fit(task, loader_train, loader_val)
示例#7
0
def main(args):
    """Evaluate a trained checkpoint with a DDP Lightning trainer.

    Loads the config, builds the validation pipeline and evaluator, restores
    the checkpoint given by ``args.model`` and runs ``trainer.test``.
    """
    load_config(cfg, args.config)
    local_rank = -1  # single launcher process; Lightning spawns DDP workers
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    cfg.defrost()
    timestr = datetime.datetime.now().__format__('%Y%m%d%H%M%S')
    # Every run writes into its own timestamped sub-directory.
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    assert args.task in ['val', 'test']
    cfg.update({'test_mode': args.task})

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True)
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    # Fix: load to CPU first so a GPU-saved checkpoint can be restored on a
    # CPU-only host; Lightning moves the weights to the target device later.
    ckpt = torch.load(args.model, map_location='cpu')
    if 'pytorch-lightning_version' not in ckpt:
        warnings.warn(
            'Warning! Old .pth checkpoint is deprecated. '
            'Convert the checkpoint with tools/convert_old_checkpoint.py ')
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt['state_dict'])

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        gpus=cfg.device.gpu_ids,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
    )
    logger.log('Starting testing...')
    trainer.test(task, val_dataloader)
示例#8
0
def main(args):
    """Main training entry point.

    Validates the config, builds the data pipelines and evaluator, restores
    pretrained weights when requested, and drives training through a
    PyTorch Lightning Trainer (DDP when more than one GPU is configured).
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        raise ValueError(
            "cfg.model.arch.head.num_classes must equal len(cfg.class_names), "
            "but got {} and {}".format(cfg.model.arch.head.num_classes,
                                       len(cfg.class_names)))
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)

    logger = NanoDetLightningLogger(cfg.save_dir)
    logger.dump_cfg(cfg)

    if args.seed is not None:
        logger.info("Set random seed to {}".format(args.seed))
        pl.seed_everything(args.seed)

    logger.info("Setting up data...")
    train_set = build_dataset(cfg.data.train, "train")
    val_set = build_dataset(cfg.data.val, "test")

    evaluator = build_evaluator(cfg.evaluator, val_set)

    # Both loaders share everything except shuffling and last-batch dropping.
    common_loader_kwargs = dict(
        batch_size=cfg.device.batchsize_per_gpu,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
    )
    train_loader = torch.utils.data.DataLoader(
        train_set, shuffle=True, drop_last=True, **common_loader_kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_set, shuffle=False, drop_last=False, **common_loader_kwargs)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    if "load_model" in cfg.schedule:
        ckpt = torch.load(cfg.schedule.load_model)
        if "pytorch-lightning_version" not in ckpt:
            warnings.warn(
                "Warning! Old .pth checkpoint is deprecated. "
                "Convert the checkpoint with tools/convert_old_checkpoint.py ")
            ckpt = convert_old_model(ckpt)
        load_model_weight(task.model, ckpt, logger)
        logger.info("Loaded model weight from {}".format(
            cfg.schedule.load_model))

    if "resume" in cfg.schedule:
        model_resume_path = os.path.join(cfg.save_dir, "model_last.ckpt")
    else:
        model_resume_path = None

    if len(cfg.device.gpu_ids) > 1:
        accelerator = "ddp"
    else:
        accelerator = None

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator=accelerator,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
        logger=logger,
        benchmark=True,
        gradient_clip_val=cfg.get("grad_clip", 0.0),
    )

    trainer.fit(task, train_loader, val_loader)
示例#9
0
    def startNanodetTrain(self):
        """Launch NanoDet training driven by this object's ``nanoTrainConfig``.

        Reads the config file path, local rank, save dir and optional seed
        from ``self.nanoTrainConfig``, builds the model, data loaders and
        evaluator, then runs the legacy trainer loop on CPU
        (``model.cpu()``) unless multiple GPUs are configured.
        """
        # Load the config file.
        load_config(cfg, self.nanoTrainConfig['cfg'])
        # Determine this host's role in distributed training.
        local_rank = int(self.nanoTrainConfig["local_rank"])
        # torch.backends.cudnn.enabled = True
        # torch.backends.cudnn.benchmark = True
        mkdir(local_rank, self.nanoTrainConfig["save_dir"])
        logger = Logger(local_rank, self.nanoTrainConfig["save_dir"])
        if self.nanoTrainConfig.keys().__contains__("seed"):
            logger.log('Set random seed to {}'.format(
                self.nanoTrainConfig['seed']))
            self.init_seeds(self.nanoTrainConfig['seed'])

        # 1. Create the model.
        model = build_model(cfg.model)
        model = model.cpu()

        # 2. Load the data.
        logger.log('Setting up data...')
        # NOTE(review): build_dataset here takes an extra nanoTrainConfig
        # argument — presumably a customized dataset builder; confirm.
        train_dataset = build_dataset(cfg.data.train, 'train',
                                      self.nanoTrainConfig)
        val_dataset = build_dataset(cfg.data.val, 'test', self.nanoTrainConfig)

        if len(cfg.device.gpu_ids) > 1:
            # Distributed path: pin this process to a GPU, join the NCCL
            # group, and shard the training data with DistributedSampler.
            print('rank = ', local_rank)
            num_gpus = torch.cuda.device_count()
            torch.cuda.set_device(local_rank % num_gpus)
            dist.init_process_group(backend='nccl')
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.device.batchsize_per_gpu,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                sampler=train_sampler,
                drop_last=True)
        else:
            print("加载数据...")
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=cfg.device.batchsize_per_gpu,
                shuffle=True,
                num_workers=cfg.device.workers_per_gpu,
                pin_memory=True,
                collate_fn=collate_function,
                drop_last=True)

        val_dataloader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True,
            collate_fn=collate_function,
            drop_last=True)

        trainer = build_trainer(local_rank, cfg, model, logger)

        if 'load_model' in cfg.schedule:
            trainer.load_model(cfg)   # warm-start weights only
        if 'resume' in cfg.schedule:
            trainer.resume(cfg)       # full resume: weights + optimizer state

        evaluator = build_evaluator(cfg, val_dataset)

        logger.log('Starting training...')
        trainer.run(train_dataloader, val_dataloader, evaluator,
                    self.nanoTrainConfig)
示例#10
0
def run(args):
    """Legacy training entry point with a debug mode.

    Builds the model, (optionally distributed) data loaders and evaluator,
    restores/resumes weights per ``cfg.schedule``, and starts the trainer.

    :param args: parsed CLI namespace with ``config``, ``local_rank``,
        ``seed`` and ``is_debug`` attributes.
    :return: None
    """
    load_config(cfg, args.config)

    local_rank = int(args.local_rank)  # process rank for DDP launches
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        init_seeds(args.seed)

    logger.log('Creating model...')
    model = build_model(cfg.model)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')

    # Debug mode loads data in the main process (num_workers=0) so that
    # breakpoints inside dataset code work. These two variables replace the
    # four near-identical DataLoader blocks of the original, which differed
    # only in their num_workers value.
    train_workers = 0 if args.is_debug else cfg.device.workers_per_gpu
    val_workers = 0 if args.is_debug else 1

    if len(cfg.device.gpu_ids) > 1:  # more than one GPU (distributed training)
        print('rank = ', local_rank)
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(local_rank % num_gpus)
        dist.init_process_group(backend='nccl')
        # The sampler shards and shuffles the data, so shuffle= is omitted.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=cfg.device.batchsize_per_gpu,
                                                        num_workers=train_workers,
                                                        pin_memory=True,
                                                        collate_fn=collate_function,
                                                        sampler=train_sampler,
                                                        drop_last=True)
    else:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=cfg.device.batchsize_per_gpu,
                                                        shuffle=True,
                                                        num_workers=train_workers,
                                                        pin_memory=True,
                                                        collate_fn=collate_function,
                                                        drop_last=True)

    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_workers=val_workers,
                                                  pin_memory=True,
                                                  collate_fn=collate_function,
                                                  drop_last=True)

    # ----- Build the trainer and restore weights if requested.
    trainer = build_trainer(local_rank, cfg, model, logger)

    if 'load_model' in cfg.schedule:
        trainer.load_model(cfg)   # warm-start weights only
    if 'resume' in cfg.schedule:
        trainer.resume(cfg)       # full resume: weights + optimizer + epoch

    # ----- Build a evaluator
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Starting training...')
    trainer.run(train_data_loader, val_data_loader, evaluator)