def main():
    global args, train_writer, test_writer
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda" if args.cuda else "cpu")

    # tensorboard logging
    train_writer = SummaryWriter(comment='train')
    test_writer = SummaryWriter(comment='test')

    # dataset
    num_class, img_dim, train_loader, test_loader = get_setting(args)

    # model
    #     A, B, C, D = 64, 8, 16, 16
    A, B, C, D = 32, 32, 32, 32
    model = capsules(A=A,
                     B=B,
                     C=C,
                     D=D,
                     E=num_class,
                     iters=args.em_iters,
                     add_decoder=args.add_decoder,
                     img_dim=img_dim).to(device)

    print("Number of trainable parameters: {}".format(
        sum(param.numel() for param in model.parameters())))
    criterion = CapsuleLoss(alpha=args.alpha,
                            mode='bce',
                            num_class=num_class,
                            add_decoder=args.add_decoder)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    best_loss, best_score = test(test_loader, model, criterion, 0, device)
    for epoch in range(1, args.epochs + 1):
        scores = train(train_loader, model, criterion, optimizer, epoch,
                       device)

        if epoch % args.test_intvl == 0:
            test_loss, test_score = test(test_loader, model, criterion,
                                         epoch * len(train_loader), device)
            if test_loss < best_loss or test_score > best_score:
                snapshot(model, args.snapshot_folder, epoch)
            best_loss = min(best_loss, test_loss)
            best_score = max(best_score, test_score)
    print('best test score: {:.6f}'.format(best_score))

    train_writer.close()
    test_writer.close()

    # save end model
    snapshot(model, args.snapshot_folder, 'end_{}'.format(args.epochs))
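The snapshot(model, folder, epoch) helper used above is project-specific. Below is a minimal sketch of what such a checkpointing helper typically does; the folder layout and filename pattern are assumptions, not the original implementation.

import os
import torch

def snapshot(model, snapshot_folder, epoch):
    # Hypothetical sketch: persist the model weights in the snapshot folder,
    # tagging the file with the epoch (or a label such as 'end_10').
    os.makedirs(snapshot_folder, exist_ok=True)
    path = os.path.join(snapshot_folder, 'model_{}.pth'.format(epoch))
    torch.save(model.state_dict(), path)
    return path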
Example #2
def train(epochNum):
    writer = SummaryWriter('../log/' + date +
                           '/ResNet50/')  # log directory layout: /log/<date>/ResNet50/
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset,
                                  BATCH_SIZE,
                                  num_workers=16,
                                  shuffle=True)
    val_dataLoader = DataLoader(val_dataset,
                                BATCH_SIZE,
                                num_workers=1,
                                shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    min_loss = 4.1
    print('min_loss is :%f' % (min_loss))
    min_acc = 0.80
    patience = 0
    lr = 0.0
    momentum = 0.0
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)

        # For the first few epochs, train only the newly added fully connected layer
        if epoch == 0 or epoch == 1 or epoch == 2:
            lr = 1e-3
            optimizer = torch.optim.Adam(model.fresh_params(),
                                         lr=lr,
                                         amsgrad=True,
                                         weight_decay=1e-4)
        else:
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr,
                                         amsgrad=True,
                                         weight_decay=1e-4)
        if epoch == 3:
            lr = 1e-3
            momentum = 0.9
            print('set lr=:%f,momentum=%f' % (lr, momentum))
        if patience == 2 and lr == 1e-3:
            patience = 0
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date +
                           '_loss_best.pth')['state_dict'])
            lr = lr / 10
            print('loss has not improved; dividing lr by 10, lr is now: %f' % (lr))
        if patience == 2 and lr == 1e-4:
            # patience exhausted at the lowest learning rate: stop training early
            break

        # running averages of loss and accuracy during training
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()

        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)  # put the model in training mode
            n_batchsize = inputs.size(0)
            optimizer.zero_grad()  # clear the gradients of all parameters
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)  # accumulate this batch's loss
            _, preds = torch.max(outputs.data, 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data,
                n_batchsize)  # accumulate this batch's accuracy
            loss.backward()
            optimizer.step()

            # print training stats every 10 batches
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch, batch_idx,
                       running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
                # run a validation pass every 300 batches
                if batch_idx % 300 == 299:
                    lx, px = utils.predict(model, val_dataLoader)
                    log_loss = criterion(px, lx)
                    log_loss = log_loss.item()
                    _, preds = torch.max(px, dim=1)
                    accuracy = torch.mean((preds == lx).float())
                    writer.add_scalar('Val/Acc', accuracy, niter)
                    writer.add_scalar('Val/Loss', log_loss, niter)
                    print(
                        '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                        % (str(datetime.datetime.now()), epoch, batch_idx,
                           accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))

        # validate on the full validation set after each epoch
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))

        # save a model snapshot if the validation loss improves on min_loss
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('save new model loss,now loss is ', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1

        # save a model snapshot if the validation accuracy exceeds min_acc
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('save new model acc,now acc is ', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
Example #3
def trainWithRawData(path, epochNum):
    try:
        print('[+] loading modelParams...', end='', flush=True)
        modelParams = torch.load(path)
        print('Done')
    except IOError:
        print("Error: 没有找到文件或读取文件失败")
    writer = SummaryWriter('../log/' + date +
                           '/ResNet50/')  # log directory layout: /log/<date>/ResNet50/
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset,
                                  BATCH_SIZE,
                                  num_workers=16,
                                  shuffle=True)
    val_dataLoader = DataLoader(val_dataset,
                                BATCH_SIZE,
                                num_workers=1,
                                shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    model.load_state_dict(modelParams['state_dict'])
    min_loss = modelParams['val_loss']
    print('val_correct is %f' % (modelParams['val_correct']))
    print('min_loss is :%f' % (min_loss))
    min_acc = max(modelParams['val_correct'], 0.81)
    optimizerSave = modelParams['optimizer']  # saved optimizer state (not reloaded below)
    patience = 0
    lr = 1e-4
    momentum = 0.9
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)
        if patience == 3:
            patience = 0
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date +
                           '_loss_best.pth')['state_dict'])
            lr = lr / 5
            print('loss has not improved; lr is now: %f' % (lr))
        optimizer = torch.optim.SGD(params=model.parameters(),
                                    lr=lr,
                                    momentum=0.9)

        # running averages of loss and accuracy during training
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()

        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)
            n_batchsize = inputs.size(0)
            inputs = inputs.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data, n_batchsize)
            loss.backward()
            optimizer.step()
            # print training stats every 10 batches
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch, batch_idx,
                       running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
                # run a validation pass every 300 batches
                if batch_idx % 300 == 299:
                    lx, px = utils.predict(model, val_dataLoader)
                    log_loss = criterion(px, lx)
                    log_loss = log_loss.item()
                    _, preds = torch.max(px, dim=1)
                    accuracy = torch.mean((preds == lx).float())
                    writer.add_scalar('Val/Acc', accuracy, niter)
                    writer.add_scalar('Val/Loss', log_loss, niter)
                    print(
                        '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                        % (str(datetime.datetime.now()), epoch, batch_idx,
                           accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))

        # validate on the full validation set after each epoch
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))

        # save a model snapshot if the validation loss improves on min_loss
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('save new model loss,now loss is ', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1

        # save a model snapshot if the validation accuracy exceeds min_acc
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('save new model acc,now acc is ', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
Example #4
    DICT ={"ran_epoch":epoch,"n_current_steps":optimizer.n_current_steps,"delta":optimizer.delta} if type(optimizer) == ScheduledOptim else {"ran_epoch":epoch}
    POPEN.update_ini_file(DICT,logger)
    
    
#    -----------| compare the result |-----------
    if best_loss > val_total_loss:
        # update best performance
        best_loss = min(best_loss,val_total_loss)
        best_acc = max(best_acc,val_avg_acc)
        best_epoch = epoch
        
        # save
        utils.snapshot(POPEN.vae_pth_path, {
                    'epoch': epoch + 1,
                    'validation_acc': val_avg_acc,
                    # 'state_dict': model.state_dict(),
                    'state_dict': model,
                    'validation_loss': val_total_loss,
                    'optimizer': optimizer.state_dict(),
                })
        
        # update the popen
        POPEN.update_ini_file({'run_name':run_name,
                            "ran_epoch":epoch,
                            "best_acc":best_acc},
                            logger)
        
    elif (epoch - best_epoch >= 30) and (type(optimizer) == ScheduledOptim):
        optimizer.increase_delta()
        
    elif (epoch - best_epoch >= 60) and (epoch > POPEN.max_epoch / 2):
        # at the late phase of training
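In this fragment utils.snapshot takes a target .pth path and a checkpoint dict. A minimal sketch of that two-argument form, inferred only from the call site above:

import os
import torch

def snapshot(pth_path, checkpoint_dict):
    # Hypothetical sketch: write the checkpoint dict (epoch, metrics,
    # model, optimizer state) to the given .pth path.
    dirname = os.path.dirname(pth_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    torch.save(checkpoint_dict, pth_path)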
Example #5
def main():
    global args
    args = parser.parse_args()

    print()
    print('Command-line argument values:')
    for key, value in vars(args).items():
        print('-', key, ':', value)
    print()

    params = [
        args.model,
        path_to_save_string(args.dataset), args.viewpoint_modulo,
        args.batch_size, args.epochs, args.lr, args.weight_decay, args.seed,
        args.routing_iters
    ]
    model_name = '_'.join([str(x) for x in params]) + '.pth'
    header = 'model,dataset,viewpoint_modulo,batch_size,epochs,lr,weight_decay,seed,em_iters,accuracy'
    snapshot_path = os.path.join('.', 'snapshots', model_name)
    data_path = os.path.join('.', 'results', 'training_data', model_name)
    result_path = os.path.join('.', 'results', 'pytorch_train.csv')

    make_dirs_if_not_exist([snapshot_path, data_path, result_path])

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    model, criterion, optimizer, scheduler = load_model(
        args.model,
        device_ids=args.device_ids,
        lr=args.lr,
        routing_iters=args.routing_iters)
    num_class, train_loader, test_loader = load_datasets(
        args.dataset, args.batch_size, args.test_batch_size,
        args.viewpoint_modulo)

    best_acc = 0
    training_accuracies = []
    test_accuracies = []

    if args.append:
        model.load_state_dict(torch.load(snapshot_path))
    try:
        for epoch in range(1, args.epochs + 1):
            print()
            acc = train(train_loader,
                        model,
                        criterion,
                        optimizer,
                        epoch,
                        epochs=args.epochs,
                        log_interval=args.log_interval)
            training_accuracies.append(acc)
            scheduler.step(acc)
            print('Epoch accuracy was %.1f%%. Learning rate is %.9f.' %
                  (acc, optimizer.state_dict()['param_groups'][0]['lr']))
            if epoch % args.test_interval == 0:
                test_acc, __, __, __ = test(test_loader,
                                            model,
                                            criterion,
                                            chunk=args.test_size)
                test_accuracies.append(test_acc)
                if test_acc > best_acc:
                    best_acc = test_acc
    except KeyboardInterrupt:
        print('Cancelled training after %d epochs' % (epoch - 1))
        args.epochs = epoch - 1

    acc, predictions, labels, logits = test(test_loader,
                                            model,
                                            criterion,
                                            chunk=1)
    print(f'Accuracy: {acc:.2f}% (best: {best_acc:.2f}%)')

    to_write = params + [acc.cpu().numpy()]
    append_to_csv(result_path, to_write, header=header)
    snapshot(snapshot_path, model)
    #torch.save((accuracies, labels, predictions), data_path)

    if args.learn_curve != '':
        make_dirs_if_not_exist(args.learn_curve)
        torch.save((training_accuracies, test_accuracies), args.learn_curve)
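The final accuracy is recorded through append_to_csv(path, row, header=...). A minimal sketch of such a helper, with the header-writing behaviour assumed from the call site:

import csv
import os

def append_to_csv(path, row, header=None):
    # Hypothetical sketch: append one result row, writing the header only
    # when the file does not exist yet.
    write_header = header is not None and not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(header.split(','))
        writer.writerow(row)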
Example #6
    # print some metrics
    train_samples_size = len(train_loader) * BATCH_SIZE
    valid_samples_size = len(valid_loader) * BATCH_SIZE
    loss_train_epoch = loss_train / train_samples_size
    loss_valid_epoch = loss_valid / valid_samples_size
    error_train_epoch = 100 - 100 * (acc_train / train_samples_size)
    error_valid_epoch = 100 - 100 * (acc_valid / valid_samples_size)
    error_history.append((error_train_epoch, error_valid_epoch))
    loss_history.append((loss_train_epoch, loss_valid_epoch))
    print(
        'Epoch: {} train loss: {:.5f} valid loss: {:.5f} train error: {:.2f} % valid error: {:.2f} %'
        .format(epoch, loss_train_epoch, loss_valid_epoch, error_train_epoch,
                error_valid_epoch))

    # check if model is better
    if error_valid_epoch < best_error[1]:
        best_error = (epoch, error_valid_epoch)
        snapshot(SAVED_MODELS_DIR, RUN_TIME, RUN_NAME, True, epoch,
                 error_valid_epoch, model.state_dict(),
                 model.optimizer.state_dict())

    # check that the model is not getting worse over time
    if best_error[0] + PATIENCE < epoch:
        print('Overfitting. Stopped at epoch {}.'.format(epoch))
        break
    epoch += 1

    plot_loss(RUN_TIME, RUN_NAME, loss_history)
    plot_error(RUN_TIME, RUN_NAME, error_history)
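The snapshot call in this fragment receives a directory, run identifiers, a best-model flag, the epoch, the validation error, and the model/optimizer states. A minimal sketch consistent with that call; the filename pattern and checkpoint keys are assumptions:

import os
import torch

def snapshot(saved_models_dir, run_time, run_name, is_best, epoch,
             valid_error, model_state, optimizer_state):
    # Hypothetical sketch: persist a checkpoint dict, marking the best run.
    os.makedirs(saved_models_dir, exist_ok=True)
    tag = 'best' if is_best else 'epoch_{}'.format(epoch)
    path = os.path.join(saved_models_dir,
                        '{}_{}_{}.pth'.format(run_time, run_name, tag))
    torch.save({'epoch': epoch,
                'valid_error': valid_error,
                'state_dict': model_state,
                'optimizer': optimizer_state}, path)
    return path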
Example #7
def train():
    # Load data and prepare training samples
    numpyImages, numpyGT = load_data()
    dataQueue = Queue(30)  # at most 30 items in the queue
    dataPreparation = [None] * cfg.nProc

    # worker process creation
    for proc in range(cfg.nProc):
        dataPreparation[proc] = Process(target=prepare_data_thread,
                                        args=(dataQueue, numpyImages, numpyGT))
        dataPreparation[proc].daemon = True
        dataPreparation[proc].start()

    def data_gen():
        for _ in range(cfg.numIterations * cfg.batchSize):
            defImg, defLab, _ = dataQueue.get()
            yield defImg, defLab

    print("Load data.")
    # tensorflow data loader
    h, w, d = params["VolSize"]
    dataset = tf.data.Dataset.from_generator(
        data_gen, (tf.float32, tf.int32),
        (tf.TensorShape([h, w, d, 1]), tf.TensorShape([h, w, d])))
    dataset = dataset.batch(batch_size=cfg.batchSize)

    print("Build model.")
    # build model
    model = vnet.VNet([h, w, d, 1], cfg.batchSize, cfg.ncls)
    learning_rate = cfg.baseLR
    learning_rate = K.optimizers.schedules.ExponentialDecay(
        learning_rate, cfg.decay_steps, cfg.decay_rate, True)
    optim = K.optimizers.SGD(learning_rate, momentum=0.99)
    criterion = K.losses.SparseCategoricalCrossentropy(from_logits=True)

    @tf.function
    def train_step(x, y):
        # Forward
        with tf.GradientTape() as tape:
            prediction = model(x)
            losses = criterion(y, prediction)
        # Backward
        with tf.name_scope("Gradients"):
            gradients = tape.gradient(losses, model.trainable_variables)
        optim.apply_gradients(zip(gradients, model.trainable_variables))
        return losses, prediction

    # File writer
    writer, logdir = utils.summary_writer(cfg)
    # Trace graph
    tf.summary.trace_on(graph=True)
    train_step(tf.zeros([1, h, w, d, 1]),
               tf.zeros([1, h, w, d]))  # dry run for tracing graph (step=1)
    tf.summary.trace_export("OpGraph", 0)

    print("Start training.")
    save_path = logdir / "snapshots"
    total_loss = 0
    dice = None
    for trImg, trLab in dataset:
        loss, pred = train_step(trImg, trLab)
        step = optim.iterations.numpy()  # steps start from 2 because of the tracing dry run
        loss_val = loss.numpy()

        # Loss moving average
        total_loss = loss_val if step < 5 else \
            cfg.moving_average * total_loss + (1 - cfg.moving_average) * loss_val

        # Logging
        if (step < 500 and step % 10 == 0) or step % cfg.log_interval == 0:
            dice = utils.compute_dice(trLab, pred)
            print(f"Step: {step}, Loss: {loss_val:.4f}, Dice: {dice:.4f}, "
                  f"LR: {learning_rate(step).numpy():.2E}")

            # Summary scalars and images
            tf.summary.scalar("loss", total_loss, step=step)
            tf.summary.scalar("dice", dice, step=step)
            tf.summary.image("trImg", trImg[..., d // 2, :], step=step)
            tf.summary.image("pred", pred[..., d // 2, :], step=step)

        # Take snapshots
        if step == 2 or step % cfg.snap_shot_interval == 0:
            filepath = utils.snapshot(model, save_path, step)
            print(f"Model weights saved (Path: {filepath}).")

    # Ending
    filepath = utils.snapshot(model, save_path, optim.iterations.numpy())
    print(f"Model weights saved ({filepath}).\nTraining ended.")
    writer.close()
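Here utils.snapshot(model, save_path, step) returns the path it wrote so it can be logged. A minimal Keras-style sketch, assuming vnet.VNet behaves like a tf.keras.Model:

from pathlib import Path

def snapshot(model, save_path, step):
    # Hypothetical sketch: save the model weights for the given step
    # (TensorFlow checkpoint format) and return the path for logging.
    save_path = Path(save_path)
    save_path.mkdir(parents=True, exist_ok=True)
    filepath = save_path / f"weights_step_{step}"
    model.save_weights(str(filepath))
    return str(filepath)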
Example #8
    for batch_i, (sent, length, label) in enumerate(train_loader):
        sent = sent.to(train_device)
        length = length.to(train_device)
        label = label.to(train_device)
        pred = model(sent, length)
        loss = criterion(pred, label)
        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f'INFO: [{epoch:02d}/{batch_i:04d}], loss: {loss.item():.4f}',
              end=', ')
        # training acc
        train_acc.update((pred, label))
        pr.update((pred, label))
        print(f'Training acc: {train_acc.compute() * 100:.2f}%')

    # testing acc
    model.eval()
    model.to(torch.device('cpu'))
    for batch_i, (sent, length, label) in enumerate(test_loader):
        with torch.no_grad():
            pred = model(sent, length)
            test_acc.update((pred, label))
    print(f'              Testing acc: {test_acc.compute() * 100:.2f}%')

    # saving models
    if test_acc.compute() > 0.9:
        snapshot(model, epoch, args.save_path)
p, r = pr.compute()
print(f'P: {p} \n R: {r}')
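In the last example snapshot(model, epoch, args.save_path) is called once the test accuracy passes 0.9. A minimal sketch of that three-argument form; the filename pattern is an assumption:

import os
import torch

def snapshot(model, epoch, save_path):
    # Hypothetical sketch: store the model weights for this epoch.
    os.makedirs(save_path, exist_ok=True)
    torch.save(model.state_dict(),
               os.path.join(save_path, 'model_epoch_{}.pth'.format(epoch)))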