示例#1
0
def train(cfg):
    """Distributed-training entry point.

    Sets up the process group and RNG seeds, builds the model stack
    (recognizer, criterion, optimizer, LR scheduler), optionally resumes
    from a checkpoint, then delegates the loop to ``do_train``.
    """
    # Environment / process-group setup.
    init_distributed_training(cfg)
    rank = get_local_rank()

    # Deterministic per-process seeding; the rank offset keeps workers
    # from sampling identical streams.
    seed = cfg.RNG_SEED + 10 * rank
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Logging setup.
    logging.setup_logging(cfg.OUTPUT_DIR)
    logger.info('init start')
    # Epoch counting starts at 1.
    arguments = {"cur_epoch": 1}

    device = get_device(rank)
    model = build_recognizer(cfg, device)
    criterion = build_criterion(cfg, device)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=True)
    if cfg.TRAIN.RESUME:
        logger.info('resume start')
        extra_checkpoint_data = checkpointer.load(map_location=device)
        if isinstance(extra_checkpoint_data, dict):
            arguments['cur_epoch'] = extra_checkpoint_data['cur_epoch']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                logger.info('warmup start')
                # Restore optimizer state from whichever scheduler owns it
                # (the wrapped one once warmup has finished), then re-bind
                # the fresh optimizer into both scheduler levels.
                src = (lr_scheduler.after_scheduler
                       if lr_scheduler.finished else lr_scheduler)
                optimizer.load_state_dict(src.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer
                logger.info('warmup end')
        logger.info('resume end')

    data_loader = build_dataloader(cfg, is_train=True)

    logger.info('init end')
    synchronize()
    do_train(cfg, arguments, data_loader, model, criterion, optimizer,
             lr_scheduler, checkpointer, device)
示例#2
0
文件: fusion_test.py 项目: ZJCV/TSM
def test(args):
    """Two-stream fusion evaluation.

    Loads an RGB model and an RGBDiff model from their respective config
    files and checkpoints, then runs joint inference on both.

    Fixes: the `f'cuda:0'` f-string had no placeholders (now a plain
    string); the per-stream config/model/checkpoint setup was duplicated
    verbatim and is now factored into `_load_stream`; dead commented-out
    code removed.
    """
    torch.backends.cudnn.benchmark = True
    logger = logging.setup_logging()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    map_location = {'cuda:%d' % 0: 'cuda:%d' % 0}

    # RGB stream.
    rgb_cfg, rgb_model = _load_stream(
        args.rgb_config_file, args.rgb_pretrained,
        args.output, logger, map_location)

    # RGBDiff stream.
    rgbdiff_cfg, rgbdiff_model = _load_stream(
        args.rgbdiff_config_file, args.rgbdiff_pretrained,
        args.output, logger, map_location)

    inference(rgb_cfg, rgb_model, rgbdiff_cfg, rgbdiff_model, device)


def _load_stream(config_file, pretrained, output_dir, logger, map_location):
    """Build one stream: frozen config + eval-mode model with weights loaded.

    Returns (cfg, model).
    """
    cfg = get_cfg_defaults()
    cfg.merge_from_file(config_file)
    cfg.DATALOADER.TEST_BATCH_SIZE = 16
    cfg.OUTPUT.DIR = output_dir
    cfg.freeze()

    model = build_model(cfg, 0)
    model.eval()
    checkpointer = CheckPointer(model, logger=logger)
    checkpointer.load(pretrained, map_location=map_location)
    return cfg, model
示例#3
0
文件: test.py 项目: ZJCV/SlowFast
def test(cfg):
    """Evaluate a model on the test set.

    Builds the model, loads pretrained weights if configured, and runs
    ``do_evaluation``. Fix: the device string was an f-string with no
    placeholders (`f'cuda:0'`); it is now a plain literal.
    """
    torch.backends.cudnn.benchmark = True

    logger = setup_logger('TEST')
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    map_location = {'cuda:%d' % 0: 'cuda:%d' % 0}

    model = build_model(cfg, map_location=map_location).to(device)
    if cfg.MODEL.PRETRAINED != "":
        if logger:
            logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        checkpointer = CheckPointer(model, logger=logger)
        checkpointer.load(cfg.MODEL.PRETRAINED, map_location=map_location)

    do_evaluation(cfg, model, device)
示例#4
0
def train(gpu, args, cfg):
    """Per-GPU distributed training worker (spawned once per device).

    Computes the global rank, builds the model stack, optionally resumes
    from a checkpoint, then runs ``do_train`` and tears down the process
    group.

    Fix: the resume branch compared ``extra_checkpoint_data != dict()``,
    which would fall through and raise if the loader returned ``None``;
    it now uses ``isinstance(..., dict)`` (matching the other train
    entry points in this file) so a missing/empty checkpoint payload is
    skipped safely.
    """
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size)

    logger = setup_logger(cfg.TRAIN.NAME)
    arguments = {"iteration": 0}

    torch.cuda.set_device(gpu)
    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    # Remap tensors saved from device 0 onto this worker's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    model = build_model(cfg, gpu, map_location=map_location)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model, optimizer=optimizer, scheduler=lr_scheduler, save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True, logger=logger)
    if args.resume:
        if is_master_proc():
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location, rank=rank)
        # Guard against a None/empty payload (e.g. no checkpoint found).
        if isinstance(extra_checkpoint_data, dict) and extra_checkpoint_data:
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.IS_WARMUP:
                if is_master_proc():
                    logger.info('warmup ...')
                # Restore optimizer state from the scheduler that owns it,
                # then re-bind the fresh optimizer into both levels.
                if lr_scheduler.finished:
                    optimizer.load_state_dict(lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg, is_train=True, start_iter=arguments['iteration'])

    synchronize()
    do_train(args, cfg, arguments,
             data_loader, model, criterion, optimizer, lr_scheduler,
             checkpointer, device, logger)
    cleanup()
示例#5
0
文件: build.py 项目: ZJCV/Non-local
def build_recognizer(cfg, device):
    """Build the configured recognizer on ``device``.

    Optionally converts BatchNorm to sync BN, loads pretrained weights,
    and wraps the model in DistributedDataParallel when running with
    more than one process.

    Fix: ``du.get_world_size()`` was queried a second time for the DDP
    check even though ``world_size`` was already stored at the top; the
    cached value is reused for consistency.
    """
    world_size = du.get_world_size()

    model = registry.RECOGNIZER[cfg.MODEL.RECOGNIZER.NAME](cfg).to(
        device=device)

    if cfg.MODEL.NORM.SYNC_BN and world_size > 1:
        logger.info("start sync BN on the process group of {}".format(
            du._LOCAL_RANK_GROUP))
        convert_sync_bn(model, du._LOCAL_PROCESS_GROUP)
    if cfg.MODEL.PRETRAINED != "":
        logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        checkpointer = CheckPointer(model)
        checkpointer.load(cfg.MODEL.PRETRAINED, map_location=device)
        logger.info("finish loading model weights")

    # Reuse the world size fetched above instead of querying again.
    if world_size > 1:
        model = DDP(model,
                    device_ids=[device],
                    output_device=device,
                    find_unused_parameters=True)

    return model
示例#6
0
文件: build.py 项目: ZJCV/SlowFast
def build_model(cfg, gpu, map_location=None, logger=None):
    """Build the configured recognizer on GPU ``gpu``.

    Optionally converts BatchNorm to sync BN across a process subgroup,
    loads pretrained weights (remapped via ``map_location``), and wraps
    the model in DistributedDataParallel for multi-process runs.

    Fix: ``du.get_world_size()`` was queried a second time for the DDP
    check even though ``world_size`` was already stored at the top; the
    cached value is reused for consistency.
    """
    model = registry.RECOGNIZER[cfg.MODEL.RECOGNIZER.NAME](
        cfg, map_location=map_location).cuda(gpu)

    world_size = du.get_world_size()
    rank = du.get_rank()
    if cfg.MODEL.SYNC_BN and world_size > 1:
        process_group = simple_group_split(world_size, rank, 1)
        convert_sync_bn(model, process_group, gpu=gpu)
    if cfg.MODEL.PRETRAINED != "":
        # Only the master process logs, to avoid duplicated output.
        if du.is_master_proc() and logger:
            logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        checkpointer = CheckPointer(model, logger=logger)
        checkpointer.load(cfg.MODEL.PRETRAINED,
                          map_location=map_location,
                          rank=rank)

    # Reuse the world size fetched above instead of querying again.
    if world_size > 1:
        model = DDP(model,
                    device_ids=[gpu],
                    output_device=gpu,
                    find_unused_parameters=True)

    return model
示例#7
0
文件: train.py 项目: ZJCV/TRN
def train(gpu, args, cfg):
    """Per-GPU distributed training worker with optional final evaluation.

    Computes the global rank, builds and optionally DDP-wraps the model,
    resumes from a checkpoint if requested, runs ``do_train``, evaluates
    on rank 0 (unless ``args.stop_eval``), and tears down the process
    group.

    Fix: the resume branch compared ``extra_checkpoint_data != dict()``,
    which would fall through and raise if the loader returned ``None``;
    it now uses ``isinstance(..., dict)`` so a missing/empty checkpoint
    payload is skipped safely.
    """
    rank = args.nr * args.gpus + gpu
    setup(rank, args.world_size, args.gpus)

    logger = setup_logger(cfg.TRAIN.NAME)
    arguments = {"iteration": 0}
    arguments['rank'] = rank

    device = torch.device(
        f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
    # Remap tensors saved from device 0 onto this worker's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    model = build_model(cfg, map_location=map_location).to(device)
    if cfg.MODEL.PRETRAINED != "":
        # Only rank 0 logs, to avoid duplicated output.
        if rank == 0 and logger:
            logger.info(f'load pretrained: {cfg.MODEL.PRETRAINED}')
        checkpointer = CheckPointer(model, logger=logger)
        checkpointer.load(cfg.MODEL.PRETRAINED,
                          map_location=map_location,
                          rank=rank)

    if args.gpus > 1:
        model = DDP(model, device_ids=[gpu], find_unused_parameters=True)
    criterion = build_criterion(cfg)
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = CheckPointer(model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT.DIR,
                                save_to_disk=True,
                                logger=logger)
    if args.resume:
        if rank == 0:
            logger.info('resume ...')
        extra_checkpoint_data = checkpointer.load(map_location=map_location,
                                                  rank=rank)
        # Guard against a None/empty payload (e.g. no checkpoint found).
        if isinstance(extra_checkpoint_data, dict) and extra_checkpoint_data:
            arguments['iteration'] = extra_checkpoint_data['iteration']
            if cfg.LR_SCHEDULER.WARMUP:
                if rank == 0:
                    logger.info('warmup ...')
                # Restore optimizer state from the scheduler that owns it,
                # then re-bind the fresh optimizer into both levels.
                if lr_scheduler.finished:
                    optimizer.load_state_dict(
                        lr_scheduler.after_scheduler.optimizer.state_dict())
                else:
                    optimizer.load_state_dict(
                        lr_scheduler.optimizer.state_dict())
                lr_scheduler.optimizer = optimizer
                lr_scheduler.after_scheduler.optimizer = optimizer

    data_loader = build_dataloader(cfg,
                                   train=True,
                                   start_iter=arguments['iteration'],
                                   world_size=args.world_size,
                                   rank=rank)

    model = do_train(args, cfg, arguments, data_loader, model, criterion,
                     optimizer, lr_scheduler, checkpointer, device, logger)

    if rank == 0 and not args.stop_eval:
        logger.info('Start final evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        do_evaluation(cfg, model, device)

    cleanup()