Пример #1
0
def start_train():
    """Train ``MyNet`` with optional mixed precision and gradient accumulation.

    The model is placed on ``cuda:0`` and wrapped in ``DataParallel`` over
    devices 0 and 1. Gradients are accumulated over ``iter_size`` batches
    before each optimizer step. After every epoch the model, optimizer and
    (when AMP is enabled) scaler state are written to ``filename.pth``.

    The names ``resume_train``, ``checkpoint``, ``optimizer``,
    ``dataloader_train`` and ``loss_function`` are not defined here and are
    read from the enclosing scope.
    """
    use_amp = True
    # Run forward/backward `iter_size` times before updating parameters, so
    # the effective batch size is batch_size * iter_size.
    iter_size = 8

    myNet = MyNet(use_amp).to("cuda:0")
    myNet = torch.nn.DataParallel(myNet, device_ids=[0, 1])  # data parallel
    myNet.train()
    # Create the gradient scaler before training; only needed with AMP.
    scaler = GradScaler() if use_amp else None

    # Restore a previous training state.
    if resume_train:
        # The scaler only exists in mixed-precision mode.
        if scaler is not None:
            scaler.load_state_dict(checkpoint['scaler'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Checkpoints written below store the weights under 'net'; keep
        # accepting 'model' for checkpoints that use that key instead.
        weights_key = "model" if "model" in checkpoint else "net"
        myNet.load_state_dict(checkpoint[weights_key])

    for epoch in range(1, 100):
        for batch_idx, (inputs, target) in enumerate(dataloader_train):

            # Move the batch to the primary device of the parallel model.
            inputs = inputs.to("cuda:0")
            target = target.to("cuda:0")

            if use_amp:
                # autocast runs eligible ops in reduced precision.
                with autocast():
                    feature = myNet(inputs)
                    losses = loss_function(target, feature)
                    # Scale down so the accumulated gradient is an average.
                    loss = losses / iter_size
                scaler.scale(loss).backward()
            else:
                # NOTE(review): this branch passes `target` to the model while
                # the AMP branch does not -- confirm MyNet's forward signature.
                feature = myNet(inputs, target)
                losses = loss_function(target, feature)
                loss = losses / iter_size
                loss.backward()

            # Update parameters once every `iter_size` accumulated batches.
            if (batch_idx + 1) % iter_size == 0:
                if use_amp:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad()
        # NOTE(review): if the loader length is not a multiple of iter_size,
        # the trailing gradients are not stepped here and remain accumulated
        # going into the next epoch -- confirm this is intended.

        # The scaler is stateful, so it is saved for use when resuming.
        state = {
            'net': myNet.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scaler': scaler.state_dict() if scaler is not None else None
        }
        torch.save(state, "filename.pth")
Пример #2
0
        inds_sim = inds_sim.cuda()
        inds_scr = inds_scr.cuda()
        target_scr = target_scr.cuda()

    target_scr = Variable(target_scr)
    # set minLabels
    args.minLabels = len(mask_inds)

# train
# Build the network and, when CUDA is in use, move it to the GPU before
# switching it to training mode.
model = MyNet(profile["count"], args.nChannel, args.nConv)
if use_cuda:
    model.cuda()
model.train()

# similarity loss definition
loss_fn = torch.nn.CrossEntropyLoss()

# scribble loss definition
loss_fn_scr = torch.nn.CrossEntropyLoss()

# continuity loss definition
# `size_average=True` is deprecated; `reduction="mean"` is its equivalent and
# avoids the deprecation warning.
loss_hpy = torch.nn.L1Loss(reduction="mean")
loss_hpz = torch.nn.L1Loss(reduction="mean")

# All-zero tensors, one element shorter along the first / second image axis
# respectively; presumably the targets for the two continuity losses above.
HPy_target = torch.zeros(im.shape[1] - 1, im.shape[2], args.nChannel)
HPz_target = torch.zeros(im.shape[1], im.shape[2] - 1, args.nChannel)

if use_cuda:
Пример #3
0
def do_train(data_path,
             model_name='mymodel',
             use_gpu=False,
             epoch_num=5,
             batch_size=100,
             learning_rate=0.01):
    """Train ``MyNet`` in fluid dygraph mode and save its parameters.

    Args:
        data_path: Passed to ``load_data(data_path, mode='train')``.
        model_name: Base name used for checkpoint and final model files.
        use_gpu: Run on ``CUDAPlace(0)`` when true, otherwise on the CPU.
        epoch_num: Number of passes over the training loader.
        batch_size: Accepted but not used inside this function.
        learning_rate: Learning rate for the SGD optimizer.

    After each epoch the model and optimizer states are saved under
    ``CHECKPOINT_PATH``; after training the model state is saved under
    ``MODEL_PATH``. Accuracy and loss are logged through ``log_writer``.
    """
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        model = MyNet()
        model.train()
        train_loader = load_data(data_path, mode='train')

        optimizer = fluid.optimizer.SGDOptimizer(
            learning_rate=learning_rate, parameter_list=model.parameters())

        # Step counter used for the scalar logs; advances by 100 per log.
        log_step = 0
        for epoch_id in range(epoch_num):
            for batch_id, (image_data, label_data) in enumerate(
                    train_loader()):
                # Convert the raw batch into dygraph variables.
                image = fluid.dygraph.to_variable(image_data)
                label = fluid.dygraph.to_variable(label_data)

                # Forward pass: the model returns both its prediction and
                # the classification accuracy.
                predict, avg_acc = model(image, label)

                # Per-sample cross entropy, averaged over the batch.
                loss = fluid.layers.cross_entropy(predict, label)
                avg_loss = fluid.layers.mean(loss)

                # Report progress every 100 batches (skipping batch 0).
                if batch_id > 0 and batch_id % 100 == 0:
                    loss_value = avg_loss.numpy()
                    acc_value = avg_acc.numpy()
                    print(
                        "epoch: {}, batch: {}, loss is: {}, acc is: {}".format(
                            epoch_id, batch_id, loss_value, acc_value))
                    log_writer.add_scalar(tag='acc',
                                          step=log_step,
                                          value=acc_value)
                    log_writer.add_scalar(tag='loss',
                                          step=log_step,
                                          value=loss_value)
                    log_step += 100

                # Backward pass, parameter update, then reset gradients.
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                model.clear_gradients()

            # Per-epoch checkpoint: model and optimizer share one prefix.
            # NOTE(review): presumably save_dygraph writes them to distinct
            # files based on the state type -- confirm.
            checkpoint_prefix = os.path.join(
                CHECKPOINT_PATH, f'{model_name}_epoch_{epoch_id}')
            fluid.save_dygraph(model.state_dict(), checkpoint_prefix)
            fluid.save_dygraph(optimizer.state_dict(), checkpoint_prefix)

        # Save the final model.
        final_path = os.path.join(MODEL_PATH, model_name)
        fluid.save_dygraph(model.state_dict(), final_path)