Example #1
def train(epochNum):
    writer = SummaryWriter('../log/' + date +
                           '/ResNet50/')  # logs are written to ../log/<date>/ResNet50/
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset,
                                  BATCH_SIZE,
                                  num_workers=16,
                                  shuffle=True)
    val_dataLoader = DataLoader(val_dataset,
                                BATCH_SIZE,
                                num_workers=1,
                                shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    min_loss = 4.1
    print('min_loss is: %f' % min_loss)
    min_acc = 0.80
    patience = 0
    lr = 0.0
    momentum = 0.0
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)

        # for the first three epochs, train only the freshly added fully connected layers
        if epoch < 3:
            lr = 1e-3
            optimizer = torch.optim.Adam(model.fresh_params(),
                                         lr=lr,
                                         amsgrad=True,
                                         weight_decay=1e-4)
        else:
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr,
                                         amsgrad=True,
                                         weight_decay=1e-4)
        if epoch == 3:
            lr = 1e-3
            momentum = 0.9  # logged only; the Adam optimizer above does not take a momentum argument
            print('set lr=%f, momentum=%f' % (lr, momentum))
        if patience == 2 and lr == 1e-3:
            patience = 0
            # reload the best checkpoint so far and lower the learning rate
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date +
                           '_loss_best.pth')['state_dict'])
            lr = lr / 10
            print('val loss stopped improving, dividing lr by 10; lr is now: %f' % lr)
        if patience == 2 and lr == 1e-4:
            # no improvement at the lowest learning rate: stop training early
            epochNum = epoch + 1
            break

        # running means of loss and accuracy over the epoch
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()

        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)  # put the model in training mode
            n_batchsize = inputs.size(0)
            optimizer.zero_grad()  # clear the gradients of all parameters
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)  # accumulate this batch's loss
            _, preds = torch.max(outputs.data, 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data,
                n_batchsize)  # accumulate this batch's accuracy
            loss.backward()
            optimizer.step()

            # log training stats every 10 batches
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch, batch_idx,
                       running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
                # additionally, run validation every 300 batches
                if batch_idx % 300 == 299:
                    lx, px = utils.predict(model, val_dataLoader)
                    log_loss = criterion(px, lx)
                    log_loss = log_loss.item()
                    _, preds = torch.max(px, dim=1)
                    accuracy = torch.mean((preds == lx).float())
                    writer.add_scalar('Val/Acc', accuracy, niter)
                    writer.add_scalar('Val/Loss', log_loss, niter)
                    print(
                        '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                        % (str(datetime.datetime.now()), epoch, batch_idx,
                           accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))

        # evaluate on the full validation set after the epoch
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))

        # snapshot the model if the validation loss improves on min_loss
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('saved new best-loss model, loss is now', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1

        # snapshot the model if the validation accuracy improves on min_acc
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('saved new best-accuracy model, accuracy is now', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
Example #2
def main(settings):
    print("start processig with settings", settings)
    utils.set_seed(settings['seed'])

    global elapsed_time

    # create the experiment folders
    logdir = os.path.join(settings['logdir'], settings['method'],
                          settings['dataset'], utils.get_runname(settings))
    pathlib.Path(logdir).mkdir(parents=True, exist_ok=True)

    # prepare
    train_set = utils.dataset_from_name(split='train', **settings)
    val_set = utils.dataset_from_name(split='val', **settings)
    test_set = utils.dataset_from_name(split='test', **settings)

    train_loader = data.DataLoader(train_set,
                                   settings['batch_size'],
                                   shuffle=True,
                                   num_workers=settings['num_workers'])
    val_loader = data.DataLoader(val_set,
                                 settings['batch_size'],
                                 shuffle=True,
                                 num_workers=settings['num_workers'])
    test_loader = data.DataLoader(test_set,
                                  settings['batch_size'],
                                  num_workers=settings['num_workers'])

    objectives = from_name(settings.pop('objectives'), train_set.task_names())
    scores = from_objectives(objectives)

    rm1 = utils.RunningMean(400)
    rm2 = utils.RunningMean(400)

    method = method_from_name(objectives=objectives, **settings)

    train_results = dict(settings=settings,
                         num_parameters=utils.num_parameters(
                             method.model_params()))
    val_results = dict(settings=settings,
                       num_parameters=utils.num_parameters(
                           method.model_params()))
    test_results = dict(settings=settings,
                        num_parameters=utils.num_parameters(
                            method.model_params()))

    with open(pathlib.Path(logdir) / "settings.json", "w") as file:
        json.dump(train_results, file)

    # main
    for j in range(settings['num_starts']):
        train_results[f"start_{j}"] = {}
        val_results[f"start_{j}"] = {}
        test_results[f"start_{j}"] = {}

        optimizer = torch.optim.Adam(method.model_params(), settings['lr'])
        if settings['use_scheduler']:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                settings['scheduler_milestones'],
                gamma=settings['scheduler_gamma'])

        for e in range(settings['epochs']):
            print(f"Epoch {e}")
            tick = time.time()
            method.new_epoch(e)

            for b, batch in enumerate(train_loader):
                batch = utils.dict_to_cuda(batch)
                optimizer.zero_grad()
                stats = method.step(batch)
                optimizer.step()

                loss, sim = stats if isinstance(stats, tuple) else (stats, 0)
                print(
                    "Epoch {:03d}, batch {:03d}, train_loss {:.4f}, sim {:.4f}, rm train_loss {:.3f}, rm sim {:.3f}"
                    .format(e, b, loss, sim, rm1(loss), rm2(sim)))

            tock = time.time()
            elapsed_time += (tock - tick)

            if settings['use_scheduler']:
                val_results[f"start_{j}"][f"epoch_{e}"] = {
                    'lr': scheduler.get_last_lr()[0]
                }
                scheduler.step()

            # run eval on train set (mainly for debugging)
            if settings['train_eval_every'] > 0 and (
                    e + 1) % settings['train_eval_every'] == 0:
                train_results = evaluate(
                    j,
                    e,
                    method,
                    scores,
                    train_loader,
                    logdir,
                    reference_point=settings['reference_point'],
                    split='train',
                    result_dict=train_results)

            if settings['eval_every'] > 0 and (
                    e + 1) % settings['eval_every'] == 0:
                # Validation results
                val_results = evaluate(
                    j,
                    e,
                    method,
                    scores,
                    val_loader,
                    logdir,
                    reference_point=settings['reference_point'],
                    split='val',
                    result_dict=val_results)

                # Test results
                test_results = evaluate(
                    j,
                    e,
                    method,
                    scores,
                    test_loader,
                    logdir,
                    reference_point=settings['reference_point'],
                    split='test',
                    result_dict=test_results)

            # Checkpoints
            if settings['checkpoint_every'] > 0 and (
                    e + 1) % settings['checkpoint_every'] == 0:
                pathlib.Path(os.path.join(logdir,
                                          'checkpoints')).mkdir(parents=True,
                                                                exist_ok=True)
                torch.save(
                    method.model.state_dict(),
                    os.path.join(logdir, 'checkpoints',
                                 'c_{}-{:03d}.pth'.format(j, e)))

        print("epoch_max={}, val_volume_max={}".format(epoch_max, volume_max))
        pathlib.Path(os.path.join(logdir, 'checkpoints')).mkdir(parents=True,
                                                                exist_ok=True)
        torch.save(
            method.model.state_dict(),
            os.path.join(logdir, 'checkpoints',
                         'c_{}-{:03d}.pth'.format(j, 999999)))
    return volume_max
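
Example #2 is driven entirely by a settings dictionary; the keys it needs can be read off the function body. Below is a hypothetical invocation with placeholder values; the method name, dataset name, objective names and reference point are assumptions that depend on the surrounding project's from_name / method_from_name / dataset_from_name registries.

# Hypothetical settings dict assembled from the keys main() reads; the real
# project presumably builds this from argparse or a config file.
settings = {
    'seed': 42,
    'logdir': 'results',
    'method': 'example_method',          # placeholder, resolved by method_from_name
    'dataset': 'example_dataset',        # placeholder, resolved by dataset_from_name
    'batch_size': 256,
    'num_workers': 4,
    'objectives': ['CrossEntropyLoss'],  # placeholder objective names
    'lr': 1e-3,
    'use_scheduler': True,
    'scheduler_milestones': [20, 40],
    'scheduler_gamma': 0.1,
    'num_starts': 1,
    'epochs': 50,
    'train_eval_every': 0,   # 0 disables the extra train-set evaluation
    'eval_every': 5,
    'checkpoint_every': 10,
    'reference_point': [2.0, 2.0],
}

final_volume = main(settings)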
Example #3
def trainWithRawData(path, epochNum):
    try:
        print('[+] loading modelParams...', end='', flush=True)
        modelParams = torch.load(path)
        print('Done')
    except IOError:
        print("Error: 没有找到文件或读取文件失败")
    writer = SummaryWriter('../log/' + date +
                           '/ResNet50/')  # logs are written to ../log/<date>/ResNet50/
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset,
                                  BATCH_SIZE,
                                  num_workers=16,
                                  shuffle=True)
    val_dataLoader = DataLoader(val_dataset,
                                BATCH_SIZE,
                                num_workers=1,
                                shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    model.load_state_dict(modelParams['state_dict'])
    min_loss = modelParams['val_loss']
    print('val_correct is %f' % (modelParams['val_correct']))
    print('min_loss is: %f' % min_loss)
    min_acc = max(modelParams['val_correct'], 0.81)
    optimizerSave = modelParams['optimizer']  # saved optimizer state (loaded but not reused below)
    patience = 0
    lr = 1e-4
    momentum = 0.9
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)
        if patience == 3:
            patience = 0
            # reload the best checkpoint so far and lower the learning rate
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date +
                           '_loss_best.pth')['state_dict'])
            lr = lr / 5
            print('val loss stopped improving, lr is now: %f' % lr)
        optimizer = torch.optim.SGD(params=model.parameters(),
                                    lr=lr,
                                    momentum=0.9)

        # running means of loss and accuracy over the epoch
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()

        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)
            n_batchsize = inputs.size(0)
            inputs = inputs.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data, n_batchsize)
            loss.backward()
            optimizer.step()
            # log training stats every 10 batches
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch, batch_idx,
                       running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
                # additionally, run validation every 300 batches
                if batch_idx % 300 == 299:
                    lx, px = utils.predict(model, val_dataLoader)
                    log_loss = criterion(px, lx)
                    log_loss = log_loss.item()
                    _, preds = torch.max(px, dim=1)
                    accuracy = torch.mean((preds == lx).float())
                    writer.add_scalar('Val/Acc', accuracy, niter)
                    writer.add_scalar('Val/Loss', log_loss, niter)
                    print(
                        '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                        % (str(datetime.datetime.now()), epoch, batch_idx,
                           accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))

        # evaluate on the full validation set after the epoch
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))

        # snapshot the model if the validation loss improves on min_loss
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('saved new best-loss model, loss is now', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1

        # snapshot the model if the validation accuracy improves on min_acc
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('saved new best-accuracy model, accuracy is now', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")