Example #1
def train(cfg):
    logger = setup_logger(name='Train', level=cfg.LOGGER.LEVEL)
    logger.info(cfg)
    model = build_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    #model.to(cuda_device = 'cuda:9')

    criterion = build_loss(cfg)

    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    train_loader = build_data(cfg, is_train=True)
    val_loader = build_data(cfg, is_train=False)

    logger.info(train_loader.dataset)
    logger.info(val_loader.dataset)

    arguments = dict()
    arguments["iteration"] = 0

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    checkpointer = Checkpointer(model, optimizer, scheduler, cfg.SAVE_DIR)

    do_train(cfg, model, train_loader, val_loader, optimizer, scheduler,
             criterion, checkpointer, device, checkpoint_period, arguments,
             logger)
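A note on the shared pattern: nearly every example here delegates optimizer construction to a project-specific build_optimizer factory that is not shown. A minimal sketch of what such a factory typically looks like, assuming hypothetical config keys SOLVER.OPTIMIZER_NAME, SOLVER.BASE_LR and SOLVER.WEIGHT_DECAY (these names are illustrative, not taken from the examples):

import torch


def build_optimizer(cfg, model):
    # Only pass trainable parameters so frozen layers are skipped.
    params = [p for p in model.parameters() if p.requires_grad]
    name = cfg.SOLVER.OPTIMIZER_NAME  # hypothetical key, e.g. "SGD" or "Adam"
    if name == "SGD":
        return torch.optim.SGD(params, lr=cfg.SOLVER.BASE_LR,
                               momentum=0.9, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    if name == "Adam":
        return torch.optim.Adam(params, lr=cfg.SOLVER.BASE_LR,
                                weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    raise ValueError("Unsupported optimizer: %s" % name)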
Example #2
def main():
    # Read the config file
    with open('config/default.yml') as fin:
        config = yaml.load(fin, Loader=yaml.SafeLoader)

    # `device` is used below but never defined in the original snippet; assume CUDA if available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the train and valid datasets
    train_config = config['dataset']['train']
    train_df = pd.read_csv(train_config['data_path'], sep='\t')
    # sample() returns a shuffled copy, so reassign to actually shuffle.
    train_df = train_df.sample(frac=1)
    train, valid = train_test_split(train_df,
                                    test_size=config['train_valid_split'])
    train_dataset = build_dataloader(train, train_config, device=device)
    valid_dataset = build_dataloader(valid, train_config, device=device)

    # Build the model
    model_config = config['model']
    model = BertClassifier(model_config)
    model.to(device)
    optimizer = build_optimizer(model, config['optimizer'])

    # Compute the number of training steps
    num_train_steps = int(
        len(train_dataset) / train_dataset.batch_size * config['num_epochs'])
    num_warmup_steps = int(num_train_steps *
                           config['optimizer']['warmup_proportion'])
    scheduler = build_scheduler(optimizer, num_train_steps, num_warmup_steps)

    # Train
    trainer.do_train(model,
                     train_loader=train_dataset,
                     valid_loader=valid_dataset,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     cfg=config)
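The build_scheduler(optimizer, num_train_steps, num_warmup_steps) call above is not shown; for a BERT classifier it typically wraps linear warmup followed by linear decay. A minimal sketch using torch.optim.lr_scheduler.LambdaLR (the schedule shape is an assumption, not taken from the example):

from torch.optim.lr_scheduler import LambdaLR


def build_scheduler(optimizer, num_train_steps, num_warmup_steps):
    def lr_lambda(step):
        # Linear warmup to the base LR, then linear decay to zero.
        if step < num_warmup_steps:
            return float(step) / max(1, num_warmup_steps)
        return max(0.0, float(num_train_steps - step) /
                   max(1, num_train_steps - num_warmup_steps))
    return LambdaLR(optimizer, lr_lambda)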
Example #3
def train(cfg, local_rank, distributed):

    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir,
                                save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg,
                                          inp_size,
                                          is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
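Example #3 uses NVIDIA apex's amp.initialize for mixed precision. A rough sketch of the equivalent training step with PyTorch's built-in torch.cuda.amp, reusing model, optimizer, train_dataloader and use_mixed_precision from the example (the loss computation and loop unpacking are placeholders):

import torch

scaler = torch.cuda.amp.GradScaler(enabled=use_mixed_precision)

for images, targets in train_dataloader:  # placeholder unpacking
    optimizer.zero_grad()
    # autocast runs the forward pass in float16 where it is numerically safe
    with torch.cuda.amp.autocast(enabled=use_mixed_precision):
        loss = model(images, targets)
    # GradScaler handles loss scaling and skips the step on overflow
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()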
Example #4
def train_net(cfg, logger, is_distributed=False, local_rank=0):
    model = build_model(cfg)

    p_sum = 0
    for name, param in model.named_parameters():
        logger.info('%s, %s, %s' % (name, param.shape, param.requires_grad))
        if param.requires_grad:
            p_sum += param.numel()

    logger.info('model learnable parameters: %d\n' % p_sum)

    device = torch.device(cfg.MODEL.DEVICE)
    model = model.to(device)

    if is_distributed:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank,
                                                    broadcast_buffers=False)

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_scheduler(cfg, optimizer)

    arguments = {}
    arguments["epoch"] = 0

    checkpointer = CheckPointer(
        cfg=cfg,
        logger=logger,
        model=model,
        optimizer=optimizer,
        scheduler=lr_scheduler,
        save_dir=cfg.OUTPUT_DIR,
        #is_distributed=is_distributed,
        save_to_disk=get_rank() == 0)

    extra_checkpoint_data = checkpointer.load()

    arguments.update(extra_checkpoint_data)

    criterion = build_loss(cfg)

    train_loader = build_data_loader(cfg, True, is_distributed=is_distributed)
    model = do_train(cfg=cfg,
                     model=model,
                     device=device,
                     train_loader=train_loader,
                     optimizer=optimizer,
                     lr_scheduler=lr_scheduler,
                     criterion=criterion,
                     checkpointer=checkpointer,
                     arguments=arguments,
                     logger=logger)
    return model
Example #5
    def __init__(self, *args, **kwargs):
        super(VariationalOptimization, self).__init__(*args, **kwargs)

        # Initialise mean and sd
        if self.cfg.MODEL.POLICY.NETWORK:
            # Set a feedforward network for means
            self.mean = FeedForward(
                self.state_dim,
                self.cfg.MODEL.POLICY.LAYERS,
                self.action_dim
            )
        else:
            # Set tensors for means
            self.mean = Parameter(torch.from_numpy(
                self.initialise_mean(self.cfg.MODEL.POLICY.INITIAL_ACTION_MEAN, self.cfg.MODEL.POLICY.INITIAL_ACTION_SD)
            ))
            self.register_parameter("mean", self.mean)

        # Set tensors for standard deviations
        self.sd = Parameter(torch.from_numpy(self.initialise_sd(self.cfg.MODEL.POLICY.INITIAL_SD)))
        self.initial_clamped_sd = self.clamp_sd(self.sd.detach())
        self.register_parameter("sd", self.sd)
        #self.clamped_sd = np.zeros((self.action_dim, self.horizon), dtype=np.float64)
        self.clamped_action = np.zeros((self.action_dim, self.horizon, self.batch_size), dtype=np.float64)

        # Initialise optimizer
        if self.method == "H":
            # Separate mean and sd optimizers (not sure if actually necessary)
            self.optimizer = {"mean": build_optimizer(self.cfg, self.get_named_parameters("mean")),
                              "sd": build_optimizer(self.cfg, self.get_named_parameters("sd"))}
            self.best_actions = np.empty(self.sd.shape)
            self.best_actions.fill(np.nan)
        else:
            self.optimizer = build_optimizer(self.cfg, self.named_parameters())

        # We need log probabilities for calculating REINFORCE loss
        self.log_prob = torch.empty(self.batch_size, self.horizon, dtype=torch.float64)
Example #6
def train(setting_dict):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("SSD.trainer")
    logger.setLevel(logging.INFO)
    logger.info("start training....")

    model = SSDdetector(setting_dict=setting_dict["model"])

    ## To fine-tune a pretrained model, point the "predtrained_model" entry at your checkpoint.
    if setting_dict["fine_tune"]:
        checkpoint = torch.load(setting_dict["predtrained_model"],
                                map_location=torch.device("cpu"))
        # Pop the state dict once; the original popped "model" twice, which raises a KeyError.
        state_dict = checkpoint.pop("model")
        backbone_dict = {}
        for key, value in state_dict.items():
            if "backbone" in key:
                backbone_dict[key.replace("backbone.", "")] = value
        model.backbone.load_state_dict(backbone_dict)
        model.load_state_dict(state_dict)

        # Freeze the backbone so only the detection head is updated.
        for para in model.backbone.parameters():
            para.requires_grad = False


    device = torch.device(setting_dict["device"])
    model.to(device)
    lr = setting_dict["solver"]["LR"]


    ## When fine-tuning, pass only the head (model.boxhead) to the optimizer
    ## instead of the whole model; see the sketch after this example.
    optimizer = build_optimizer(setting_dict["solver"]["optimizer"], model, lr)
    scheduler = build_LRscheduler(setting_dict["solver"]["LRscheduler"])(optimizer,
                                                                         setting_dict["solver"]["LR_STEP"])
    train_loader = make_dataLoader(setting_dict["train"], True)
    test_loader = make_dataLoader(setting_dict["test"], False)
    checkpointer = CheckPoint(model, optimizer, scheduler, "", logger)
    print(setting_dict["train_epoch"])
    for i in range(1, setting_dict["train_epoch"] + 1):
        do_train_one_epoch(model, train_loader, optimizer, scheduler, device,
                           setting_dict["out_dir"], i)
        # The original `if i % 1 == 0` is always true, so evaluate every epoch.
        do_evaluate(model, test_loader, device, setting_dict["out_dir"], i)
        if i % 7 == 0:
            checkpointer.save(setting_dict["out_dir"] + "/v3_model_{:06d}".format(i))
    checkpointer.save("final")
    return model
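Example #6 freezes the backbone but still hands the whole model to build_optimizer. A common variant when fine-tuning only the head is to give the optimizer just the parameters that remain trainable; sketched below with a plain torch.optim.SGD call rather than the example's build_optimizer signature, reusing model and lr from the example:

import torch

# Keep only parameters that still require gradients (e.g. the box head).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(trainable_params, lr=lr, momentum=0.9)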
Example #7
def train(cfg):
    # prepare dataset
    train_loader, val_loader, test_loader, classes_list = make_data_loader(
        cfg, for_train=True)

    # build model and load parameter
    model = build_model(cfg)
    if cfg.SOLVER.SCHEDULER.RETRAIN_FROM_HEAD:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Base", cfg.TRAIN.TRICK.PRETRAIN_PATH)
    else:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Overall", cfg.TRAIN.TRICK.PRETRAIN_PATH)

    train_loader.dataset.batch_converter = model.backbone_batch_converter
    val_loader.dataset.batch_converter = model.backbone_batch_converter
    test_loader.dataset.batch_converter = model.backbone_batch_converter

    # build loss function
    loss_func, loss_class = build_loss(cfg)
    print('Train with losses:', cfg.LOSS.TYPE)

    # build optimizer (based on model)
    optimizer = build_optimizer(cfg, model,
                                bias_free=cfg.MODEL.BIAS_FREE)  # the loss may also have learnable parameters
    print("Model Bias-Free:{}".format(cfg.MODEL.BIAS_FREE))
    print('Train with optimizer type:', cfg.SOLVER.OPTIMIZER.NAME)

    # build scheduler (based on optimizer)
    scheduler, start_epoch = build_scheduler(cfg, optimizer)

    # build and launch engine for training
    do_train(
        cfg,
        model,
        train_loader,
        val_loader,
        classes_list,
        optimizer,
        scheduler,
        loss_func,
        start_epoch,
    )
Example #8
def train(config, experiment_name=None):
    num_classes = config.MODEL.NUM_CLASSES

    # dataloader for training
    train_period = 'train'
    train_loader = build_dataloader(cfg=config,
                                    period=train_period,
                                    loader_type='train')
    val_loader = build_dataloader(cfg=config,
                                  period=train_period,
                                  loader_type='val')

    # prepare model
    model = build_model(cfg=config)

    print('The loss type is', config.MODEL.LOSS_TYPE)
    loss_func = build_loss(config, num_classes)
    optimizer = build_optimizer(config, model)

    # Add for using self trained model
    if config.MODEL.PRETRAIN_CHOICE == 'self':
        # Parse the epoch from the checkpoint filename (expects a trailing _<epoch>); int() is safer than eval().
        start_epoch = int(
            config.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')[-1])
        print('Start epoch:', start_epoch)
        path_to_optimizer = config.MODEL.PRETRAIN_PATH.replace(
            'model', 'optimizer')
        print('Path to the checkpoint of optimizer:', path_to_optimizer)
        model.load_state_dict(torch.load(config.MODEL.PRETRAIN_PATH))
        optimizer.load_state_dict(torch.load(path_to_optimizer))

    scheduler = WarmUpMultiStepLR(optimizer, config.SOLVER.STEPS,
                                  config.SOLVER.GAMMA,
                                  config.SOLVER.WARMUP_FACTOR,
                                  config.SOLVER.WARMUP_ITERS,
                                  config.SOLVER.WARMUP_METHOD)

    print('------------------ Start Training -------------------')
    do_train(config, model, train_loader, val_loader, optimizer, scheduler,
             loss_func, experiment_name)
    print('---------------- Training Completed ---------------- ')
Example #9
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    #rescale base lr
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(
        1, args.batch_size_total // 256))

    # set random seed, make sure all random subgraph generated would be the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:  # a bare `if args.gpu:` would skip GPU 0
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpu per node {ngpus_per_node}, "
        f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, local rank {args.gpu}, '
                f'global rank {args.rank}, world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \
                soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
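Example #9 delegates checkpointing to a project-specific saver module. A minimal sketch of what saving and restoring such training state usually involves (the helper names and ckpt_path below are illustrative, not the example's saver API):

import torch


def save_training_state(ckpt_path, model, optimizer, lr_scheduler, epoch):
    # Persist everything needed to resume: weights, optimizer moments, schedule position.
    torch.save({
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "lr_scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
    }, ckpt_path)


def load_training_state(ckpt_path, model, optimizer, lr_scheduler):
    state = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    lr_scheduler.load_state_dict(state["lr_scheduler"])
    return state["epoch"]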
Example #10
def train(cfg_path, device='cuda'):
    if cfg_path is not None:
        cfg.merge_from_file(cfg_path)
    cfg.freeze()

    if not os.path.isdir(cfg.LOG_DIR):
        os.makedirs(cfg.LOG_DIR)
    if not os.path.isdir(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)

    model = UNet(cfg.NUM_CHANNELS, cfg.NUM_CLASSES)
    model.to(device)

    train_data_loader = build_data_loader(cfg, 'train')
    if cfg.VAL:
        val_data_loader = build_data_loader(cfg, 'val')
    else:
        val_data_loader = None

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    criterion = get_loss_func(cfg)
    writer = SummaryWriter(cfg.LOG_DIR)

    iter_counter = 0
    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    min_val_loss = 1e10

    print('Training Start')
    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        print('Epoch {}/{}'.format(epoch + 1, cfg.SOLVER.MAX_EPOCH))
        if lr_scheduler is not None:
            lr_scheduler.step(epoch)
        for data in train_data_loader:
            iter_counter += 1

            imgs, annots = data
            imgs = imgs.to(device)
            annots = annots.to(device)

            y = model(imgs)
            optimizer.zero_grad()
            loss = criterion(y, annots)
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())

            if iter_counter % 10 == 0:
                writer.add_scalars('loss', {'train': loss_meter.avg},
                                   iter_counter)
                loss_meter.reset()
            if lr_scheduler is not None:
                writer.add_scalar('learning rate',
                                  optimizer.param_groups[0]['lr'],
                                  iter_counter)
            save_as_checkpoint(model, optimizer,
                               os.path.join(cfg.SAVE_DIR, 'checkpoint.pth'),
                               epoch, iter_counter)

        # Skip validation when cfg.VAL is False
        if val_data_loader is None:
            continue

        # Reset once per epoch (not per batch) so the average covers the whole validation set.
        val_loss_meter.reset()
        for data in val_data_loader:
            with torch.no_grad():
                imgs, annots = data
                imgs = imgs.to(device)
                annots = annots.to(device)

                y = model(imgs)
                loss = criterion(y, annots)
                val_loss_meter.update(loss.item())
        if val_loss_meter.avg < min_val_loss:
            min_val_loss = val_loss_meter.avg
            writer.add_scalars('loss', {'val': val_loss_meter.avg},
                               iter_counter)
            # save model if validation loss is minimum
            torch.save(model.state_dict(),
                       os.path.join(cfg.SAVE_DIR, 'min_val_loss.pth'))
Example #11
def train(is_dist, start_epoch, local_rank):
    transforms = transform.build_transforms()
    coco_dataset = dataset.COCODataset(is_train=True, transforms=transforms)
    if (is_dist):
        sampler = distributedGroupSampler(coco_dataset)
    else:
        sampler = groupSampler(coco_dataset)
    dataloader = build_dataloader(coco_dataset, sampler)

    batch_time_meter = utils.AverageMeter()
    cls_loss_meter = utils.AverageMeter()
    reg_loss_meter = utils.AverageMeter()
    losses_meter = utils.AverageMeter()

    model = retinanet(is_train=True)
    if (start_epoch == 1):
        model.resnet.load_pretrained(pretrained_path[cfg.resnet_depth])
    else:
        utils.load_model(model, start_epoch - 1)
    model = model.cuda()

    if is_dist:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[
                local_rank,
            ],
            output_device=local_rank,
            broadcast_buffers=False)
    optimizer = solver.build_optimizer(model)
    scheduler = solver.scheduler(optimizer)

    model.train()
    logs = []

    for epoch in range(start_epoch, cfg.max_epochs + 1):
        if is_dist:
            dataloader.sampler.set_epoch(epoch - 1)
        scheduler.lr_decay(epoch)

        end_time = time.time()
        for iteration, datas in enumerate(dataloader, 1):
            scheduler.linear_warmup(epoch, iteration - 1)
            images = datas["images"]
            bboxes = datas["bboxes"]
            labels = datas["labels"]
            res_img_shape = datas["res_img_shape"]
            pad_img_shape = datas["pad_img_shape"]

            images = images.cuda()
            bboxes = [bbox.cuda() for bbox in bboxes]
            labels = [label.cuda() for label in labels]

            loss_dict = model(images,
                              gt_bboxes=bboxes,
                              gt_labels=labels,
                              res_img_shape=res_img_shape,
                              pad_img_shape=pad_img_shape)
            cls_loss = loss_dict["cls_loss"]
            reg_loss = loss_dict["reg_loss"]

            losses = cls_loss + reg_loss
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_time_meter.update(time.time() - end_time)
            end_time = time.time()

            cls_loss_meter.update(cls_loss.item())
            reg_loss_meter.update(reg_loss.item())
            losses_meter.update(losses.item())

            if (iteration % 50 == 0):
                if (local_rank == 0):
                    res = "\t".join([
                        "Epoch: [%d/%d]" % (epoch, cfg.max_epochs),
                        "Iter: [%d/%d]" % (iteration, len(dataloader)),
                        "Time: %.3f (%.3f)" %
                        (batch_time_meter.val, batch_time_meter.avg),
                        "cls_loss: %.4f (%.4f)" %
                        (cls_loss_meter.val, cls_loss_meter.avg),
                        "reg_loss: %.4f (%.4f)" %
                        (reg_loss_meter.val, reg_loss_meter.avg),
                        "Loss: %.4f (%.4f)" %
                        (losses_meter.val, losses_meter.avg),
                        "lr: %.6f" % (optimizer.param_groups[0]["lr"]),
                    ])
                    print(res)
                    logs.append(res)
                batch_time_meter.reset()
                cls_loss_meter.reset()
                reg_loss_meter.reset()
                losses_meter.reset()
        if (local_rank == 0):
            utils.save_model(model, epoch)
        if (is_dist):
            utils.synchronize()

    if (local_rank == 0):
        with open("logs.txt", "w") as f:
            for i in logs:
                f.write(i + "\n")
Example #12

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Training GlamPoints detector')
    parser.add_argument('--path_ymlfile',
                        type=str,
                        default='configs/glampoints_training.yml',
                        help='Path to yaml file.')

    opt = parser.parse_args()

    with open(opt.path_ymlfile, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.SafeLoader)

    _device = settings.initialize_cuda_and_logging(cfg)

    train_loader, val_loader = make_data_loader(cfg)

    model = build_model(cfg)
    model.to(_device)

    optimizer = build_optimizer(cfg, model)

    loss_func = build_loss(cfg)

    logger, tb_logger = build_logger(cfg)

    do_train(cfg, model, train_loader, val_loader, optimizer, loss_func,
             logger, tb_logger, _device)