Example No. 1
def train(cfg):
    model = build_model(cfg)
    # model.fix_bn()
    model = nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True
    optimizer = make_optimizer(cfg, model)
    criterion = make_criterion(cfg)
    scheduler = make_lr_scheduler(cfg, optimizer)
    metrics = make_metrics(cfg)
    train_loader = make_dataloader(cfg, is_train=True)
    val_loader = make_dataloader(cfg, is_train=False)

    cfg.TOOLS.image_n = 3
    #image_3_dataloader = make_inference_dataloader(cfg=cfg)
    image_3_dataloader = None
    cfg.TOOLS.image_n = 4
    #image_4_dataloader = make_inference_dataloader(cfg=cfg)
    image_4_dataloader = None

    do_train(cfg,
             model=model,
             train_loader=train_loader,
             val_loader=val_loader,
             optimizer=optimizer,
             scheduler=scheduler,
             loss_fn=criterion,
             metrics=metrics,
             image_3_dataloader=image_3_dataloader,
             image_4_dataloader=image_4_dataloader)
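
The commented-out model.fix_bn() call in Example No. 1 suggests the model can freeze its BatchNorm layers while fine-tuning. A minimal sketch of such a helper is shown below, assuming it would be defined as a method on the model class; only the name fix_bn comes from the source, the body is an assumption.

import torch.nn as nn

def fix_bn(self):
    # Freeze BatchNorm layers: stop updating running statistics and, when the
    # layers are affine, stop training their scale/shift parameters as well.
    for m in self.modules():
        if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            m.eval()  # keep running mean/var frozen
            if m.affine:
                m.weight.requires_grad = False
                m.bias.requires_grad = False

Note that a call to model.train() inside the training loop would switch these layers back to training mode, so a helper like this is usually re-applied after every model.train() call.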
Example No. 2
def run_train(args):

    device = torch.device(args.device)
    # build student
    student = build_model(args.student, args.num_classes, args.pretrained)
    student = student.to(device)
    # build teachers
    teachers = build_teachers(args, device)
    # build checkpointer, optimizer, scheduler, logger
    optimizer = build_optimizer(args, student)
    scheduler = build_lr_scheduler(args, optimizer)
    checkpointer = Checkpointer(student, optimizer, scheduler, args.experiment, args.checkpoint_period)
    logger = Logger(os.path.join(args.experiment, 'tf_log'))

    # objective function to train student
    loss_fn = loss_fn_kd

    # data_load
    train_loader = CIFAR10_loader(args, is_train=True)
    test_loader = CIFAR10_loader(args, is_train=False)

    acc1, m_acc1 = inference(student, test_loader, logger, device, 0, args)
    checkpointer.best_acc = acc1
    for epoch in tqdm(range(0, args.max_epoch)):
        do_train(student, teachers, loss_fn, train_loader, optimizer, checkpointer, device, logger, epoch)
        acc1, m_acc1 = inference(student, test_loader, logger, device, epoch+1, args)
        if acc1 > checkpointer.best_acc:
            checkpointer.save("model_best")
            checkpointer.best_acc = acc1
        scheduler.step()
    
    checkpointer.save("model_last")
Example No. 3
def train(cfg, local_rank, distributed):

    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir,
                                save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg,
                                          inp_size,
                                          is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
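
Example No. 3 relies on apex.amp for mixed precision. Apex's amp frontend is deprecated in favor of the native torch.cuda.amp API, so an equivalent native setup for the inner training step is sketched below; do_train's real loop is not part of the source, and the loss computation here is a placeholder.

import torch

scaler = torch.cuda.amp.GradScaler(enabled=use_mixed_precision)

for images, targets in train_dataloader:
    optimizer.zero_grad()
    # Run the forward pass in autocast; the model-returns-loss convention is
    # only an assumption for this sketch.
    with torch.cuda.amp.autocast(enabled=use_mixed_precision):
        loss = model(images.to(device), targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)          # unscale gradients, then optimizer.step()
    scaler.update()
    lr_scheduler.step()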
Example No. 4
def train(cfg):
    # prepare dataset
    train_loader, val_loader, test_loader, classes_list = make_data_loader(
        cfg, for_train=True)

    # build model and load parameter
    model = build_model(cfg)
    if cfg.SOLVER.SCHEDULER.RETRAIN_FROM_HEAD:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Base", cfg.TRAIN.TRICK.PRETRAIN_PATH)
    else:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Overall", cfg.TRAIN.TRICK.PRETRAIN_PATH)

    train_loader.dataset.batch_converter = model.backbone_batch_converter
    val_loader.dataset.batch_converter = model.backbone_batch_converter
    test_loader.dataset.batch_converter = model.backbone_batch_converter

    # build loss function
    loss_func, loss_class = build_loss(cfg)
    print('Train with losses:', cfg.LOSS.TYPE)

    # build optimizer (based on model)
    optimizer = build_optimizer(cfg, model,
                                bias_free=cfg.MODEL.BIAS_FREE)  # the loss may also have trainable parameters
    print("Model Bias-Free: {}".format(cfg.MODEL.BIAS_FREE))
    print('Train with optimizer:', cfg.SOLVER.OPTIMIZER.NAME)

    # build scheduler (based on optimizer)
    scheduler, start_epoch = build_scheduler(cfg, optimizer)

    # build and launch engine for training
    do_train(
        cfg,
        model,
        train_loader,
        val_loader,
        classes_list,
        optimizer,
        scheduler,
        loss_func,
        start_epoch,
    )
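
build_scheduler in Example No. 4 returns both the scheduler and a start_epoch, which suggests it also handles resuming. A minimal sketch under that assumption follows; the config fields MILESTONES, GAMMA, and RESUME_EPOCH are hypothetical, and the real implementation may differ.

import torch

def build_scheduler(cfg, optimizer):
    # Hypothetical field: epoch to resume from (0 means training from scratch).
    start_epoch = getattr(cfg.SOLVER.SCHEDULER, 'RESUME_EPOCH', 0)
    # PyTorch schedulers created with last_epoch != -1 expect 'initial_lr'
    # to be present in every param group.
    for group in optimizer.param_groups:
        group.setdefault('initial_lr', group['lr'])
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.SOLVER.SCHEDULER.MILESTONES,
        gamma=cfg.SOLVER.SCHEDULER.GAMMA,
        last_epoch=start_epoch - 1)
    return scheduler, start_epoch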
Example No. 5
    model = build_model(cfg, pretrain_path=arg['load_path'])

    optimizer = make_optimizer(cfg['optimizer'], model)
    lr_scheduler = wrapper_lr_scheduler(cfg['lr_scheduler'], optimizer)

    if arg['device']:  # device ids specified via the command line
        free_device_ids = arg['device']
    else:
        free_device_ids = get_free_device_ids()

    max_num_devices = cfg['max_num_devices']
    if len(free_device_ids) >= max_num_devices:
        free_device_ids = free_device_ids[:max_num_devices]

    master_device = free_device_ids[0]
    model.cuda(master_device)
    model = nn.DataParallel(model, device_ids=free_device_ids).cuda(master_device)


    if cfg['enable_backends_cudnn_benchmark']:
        print("enable backends cudnn benchmark")
        torch.backends.cudnn.benchmark = True



    cfg_copy['save_dir'] = save_dir  # update the save directory
    cfg_copy['log_dir'] = log_dir  # update the log directory
    # import pdb; pdb.set_trace()
    do_train(cfg_copy, model=model, train_loader=train_dataloader, val_loader=None, optimizer=optimizer,
             scheduler=lr_scheduler, metrics=None, device=free_device_ids)
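
get_free_device_ids() in Example No. 5 picks GPUs automatically when no device ids are passed on the command line. A possible implementation based on the free-memory ratio is sketched below; the 90% threshold, the fallback to GPU 0, and the use of torch.cuda.mem_get_info (PyTorch 1.10+) are all assumptions.

import torch

def get_free_device_ids(min_free_ratio=0.9):
    # Treat a GPU as "free" when most of its memory is unused.
    free_ids = []
    for i in range(torch.cuda.device_count()):
        free_mem, total_mem = torch.cuda.mem_get_info(i)  # bytes
        if free_mem / total_mem >= min_free_ratio:
            free_ids.append(i)
    return free_ids if free_ids else [0]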
Example No. 6
    # TODO: needs revision
    config_file = arg["CONFIG_FILE"]
    config_file = config_file.replace("../", "").replace(".py", "").replace('/', '.')
    exec(r"from {} import config as cfg".format(config_file))
    # if arg['MODEL.LOAD_PATH'] != None:  # priority: command-line args > cfg stored in the model > config_file
    #     cfg = torch.load(arg['MODEL.LOAD_PATH'])['cfg']
    cfg = merage_from_arg(cfg, arg)


    train_dataloader = make_dataloader(cfg['train_pipeline'])
    model = bulid_model(cfg['model'], cfg['pretrain'])
    criterion = make_criterion(cfg['criterion'])
    optimizer = make_optimizer(cfg['optimizer'], model)
    lr_scheduler = make_lr_scheduler(cfg['lr_scheduler'], optimizer)

    if cfg['enable_swa']:  # enable SWA; the SWA wrapper must be created after the lr_scheduler
        optimizer = torchcontrib.optim.SWA(optimizer)
        # optimizer = DP_SWA(optimizer)

    if cfg['multi_gpu']:
        # model = nn.DataParallel(model,device_ids=cfg['device_ids'])
        device_ids = cfg['device_ids']
        model = DataParallel_withLoss(model, criterion, device_ids=device_ids)

    if cfg['enable_backends_cudnn_benchmark']:
        print("enable backends cudnn benchmark")
        torch.backends.cudnn.benchmark = True
    
    do_train(cfg, model=model, train_loader=train_dataloader, val_loader=None, optimizer=optimizer,
             scheduler=lr_scheduler, loss_fn=criterion, metrics=None)
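
Example No. 6 wraps the optimizer in torchcontrib.optim.SWA, but the averaged weights only take effect once they are swapped into the model after training. A typical finalization step, run after do_train returns, is sketched below; whether the repository performs this inside do_train is not shown in the source, and in manual SWA mode optimizer.update_swa() must have been called periodically during training.

    if cfg['enable_swa']:
        # Copy the SWA running average into the model parameters.
        optimizer.swap_swa_sgd()
        # Recompute BatchNorm running statistics with the averaged weights.
        optimizer.bn_update(train_dataloader, model)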