Example #1
    def train_epoch(self):
        current_loss = 0
        train_accuracy = 0
        epoch_start = time.time()

        for step, sample_batched in enumerate(self.train_dataloader):

            with torch.set_grad_enabled(True):
                output = self.model(sample_batched)
                loss = self.loss(output,
                                 self.get_targets(sample_batched["label"]))
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                current_loss += loss.detach()
                train_accuracy += get_acc(
                    output, self.get_targets(sample_batched["label"]))

            if step % self.train_log_frequency == 0 and step > 0:
                current_loss /= self.train_log_frequency * self.train_batch_size
                train_accuracy /= self.train_log_frequency * self.train_batch_size

                self.writer.add_scalar(
                    "Train/loss",
                    current_loss,
                    (self.epoch - 1) * len(self.train_dataloader) + step,
                )
                self.writer.add_scalar(
                    "Train/accuracy",
                    train_accuracy,
                    (self.epoch - 1) * len(self.train_dataloader) + step,
                )

                print(
                    f"> epoch: {self.epoch} | step: {step} | loss: {current_loss} | train_accuracy: {train_accuracy} | epoch_training_time: {time.time() - epoch_start} s"
                )

                current_loss = 0
                train_accuracy = 0

        print(
            f"=> Train epoch {self.epoch} finished in {time.time() - epoch_start} s"
        )
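All of these examples rely on a get_acc helper that is not shown. Examples #1 and #2 divide the accumulated value by steps * batch_size, which implies get_acc returns the number of correct predictions per batch; below is a minimal sketch under that assumption (examples #3 and #4 divide by the batch count instead, which would imply a per-batch accuracy fraction).

def get_acc(output, target):
    # predicted class index per sample
    preds = output.argmax(dim=1)
    # count of correct predictions in this batch (an assumption; see above)
    return (preds == target).sum().item()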
Example #2
    def validate(self):
        validation_start = time.time()
        validation_accuracy = 0
        self.model.eval()  # Set model to evaluate mode

        # Iterate over data.
        for step, sample_batched in enumerate(self.dev_dataloader):
            with torch.no_grad():
                output = self.model(sample_batched)
                validation_accuracy += get_acc(
                    output, self.get_targets_dev(sample_batched["label"]))

        validation_accuracy /= len(self.dev_dataloader) * self.dev_batch_size
        self.writer.add_scalar("Validation/accuracy", validation_accuracy,
                               self.epoch)

        print(
            f"=> Validation epoch {self.epoch} | validation_accuracy: {validation_accuracy} | validation_time: {time.time() - validation_start} s"
        )

        if validation_accuracy > self.best_validation_accuracy:
            torch.save(self.model, f"{self.save_dir}/{self.epoch}_siamese.pth")
            self.best_validation_accuracy = validation_accuracy
            print("====> New best model saved!")
def main(mode=None):
    global name, logger

    #Tag_ResidualBlocks_BatchSize
    name = "my_log"
    logger = SummaryWriter("runs/" + name)

    cat_dir = "D:/Codewyf/AI/data/datasets/test/cat_test/"
    dog_dir = "D:/Codewyf/AI/data/datasets/test/dog_test/"

    config = load_config(mode)

    torch.manual_seed(config.SEED)  # seed the CPU RNG so results are reproducible
    torch.cuda.manual_seed(config.SEED)  # seed the GPU RNG so initialization is identical across runs
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)  # training data path
    length_train = len(train_set)  # number of training samples
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)  # number of batches per epoch

    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    length_test = len(test_set)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=True)

    cat_test_set = ImageFolder(cat_dir, transform=test_tf)
    length_cat_test = len(cat_test_set)
    cat_test_data = torch.utils.data.DataLoader(cat_test_set,
                                                batch_size=config.BATCH_SIZE,
                                                shuffle=True)

    dog_test_set = ImageFolder(dog_dir, transform=test_tf)
    length_dog_test = len(dog_test_set)
    dog_test_data = torch.utils.data.DataLoader(dog_test_set,
                                                batch_size=config.BATCH_SIZE,
                                                shuffle=True)

    # init GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose a network
    net = resnet18().to(config.DEVICE)  # use ResNet-18
    print('The Model is ResNet18\n')

    # optimizer and loss function
    optimizer = optim.SGD(
        net.parameters(), lr=config.LR, momentum=0.9,
        weight_decay=5e-4)  # stochastic gradient descent
    loss_function = nn.CrossEntropyLoss()  # cross-entropy loss

    # warmup
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES,
        gamma=0.5)  # learning-rate schedule
    # MILESTONES is a list and gamma a multiplier: starting from LR = 0.01, the
    # LR is multiplied by gamma each time the epoch reaches a milestone
    # (here 3, 6 and 9)
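    # Worked example of this schedule: with LR = 0.01, milestones = [3, 6, 9]
    # and gamma = 0.5, the LR is 0.01 for epochs 1-2, 0.005 from epoch 3,
    # 0.0025 from epoch 6, and 0.00125 from epoch 9 onward.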
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)

    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH, 'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path, '{epoch}-{type}.pth')

    best_acc = 0.0

    for epoch in range(1, config.EPOCH + 1):

        if epoch > config.WARM:
            train_scheduler.step(epoch)

        ### train ###
        net.train()  # switch to training mode before training
        train_loss = 0.0  # cost function error
        train_correct = 0.0

        for i, data in enumerate(train_data):
            steps = len(train_data) * (epoch - 1) + i  # global training step
            if epoch <= config.WARM:
                warmup_scheduler.step()

            length = len(train_data)
            image, label = data
            image, label = image.to(config.DEVICE), label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()  # zero the gradients (reset dloss/dweight to 0)
            loss.backward()
            optimizer.step()

            # log loss and accuracy every 2 steps
            if i % 2 == 0:
                train_loss_log = train_loss / (i + 1)
                train_correct_log = train_correct / (i + 1)
                logger.add_scalar('train_loss', train_loss_log, steps)
                logger.add_scalar('train_acc', train_correct_log, steps)
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'
                    .format(train_loss / (i + 1),
                            train_correct / (i + 1),
                            optimizer.param_groups[0]['lr'],
                            epoch=epoch,
                            trained_samples=i * config.BATCH_SIZE + len(image),
                            total_samples=length_train))
        ### eval ###
        net.eval()  # switch to evaluation mode before testing
        test_loss = 0.0  # cost function error
        test_correct = 0.0

        for i, data in enumerate(test_data):  # measure this epoch's accuracy on the test set
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)

            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)

            print(
                'Testing: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'
                .format(test_loss / (i + 1),
                        test_correct / (i + 1),
                        test_samples=i * config.BATCH_SIZE + len(images),
                        total_samples=length_test))
        logger.add_scalar('test_loss', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc', test_correct / (i + 1), epoch)

        # save the best model trained so far; this must run after eval,
        # otherwise test_correct is not yet defined
        acc = test_correct / (i + 1)
        if epoch > config.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='best'))
            best_acc = acc

        if not epoch % config.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='regular'))

        # eval on the cat-only test set
        net.eval()
        test_loss = 0.0
        test_correct = 0.0
        for i, data in enumerate(cat_test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
        logger.add_scalar('test_loss_cat', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc_cat', test_correct / (i + 1), epoch)

        # eval on the dog-only test set
        net.eval()
        test_loss = 0.0
        test_correct = 0.0
        for i, data in enumerate(dog_test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
        logger.add_scalar('test_loss_dog', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc_dog', test_correct / (i + 1), epoch)

        print()
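The WarmUpLR scheduler used here (and in the following examples) is a custom class that is not shown. A minimal sketch, assuming it linearly ramps each parameter group's learning rate from zero to its base value over the first iter_per_epoch * config.WARM calls to step(), which matches how it is stepped once per batch while epoch <= config.WARM:

from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    # Linearly scale the LR from ~0 up to the base LR over total_iters steps.
    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts step() calls, since step() is called once per batch
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]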
Example #4
def main(mode=None):

    config = load_config(mode)

    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)
    length1 = len(train_set)
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)

    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=False)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose network
    if config.MODEL == 1:
        net = VGG16().to(config.DEVICE)
        print('The Model is VGG\n')
    elif config.MODEL == 2:
        net = resnet34().to(config.DEVICE)
        print('The Model is ResNet34\n')
    elif config.MODEL == 3:
        net = mobilenet().to(config.DEVICE)
        print('The Model is mobilenet\n')
    elif config.MODEL == 4:
        net = shufflenet().to(config.DEVICE)
        print('The Model is shufflenet\n')
    else:
        raise ValueError(f'Unknown config.MODEL: {config.MODEL}')  # net would otherwise be undefined
#     print(dir(net))
#     # choose train or test
#     if config.MODE == 1:
#         print("Start Training...\n")
#         net.train()
#     if config.MODE == 2:
#         print("Start Testing...\n")
#         net.test()

    optimizer = optim.SGD(net.parameters(),
                          lr=config.LR,
                          momentum=0.9,
                          weight_decay=5e-4)
    loss_function = nn.CrossEntropyLoss()
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=0.2)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)
    #     optimizer = optim.Adam(net.parameters(),lr=float(config.LR),betas=(config.BETA1, config.BETA2))

    # use tensorboard
    runs_path = os.path.join(config.PATH, 'runs')
    if not os.path.exists(runs_path):
        os.mkdir(runs_path)

#     writer=SummaryWriter(log_dir=runs_path)
#     input_tensor = torch.Tensor(12, 3, 32, 32).cuda()
#     writer.add_graph(net, Variable(input_tensor, requires_grad=True))

    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH, 'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path, '{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, 100):
        if epoch > config.WARM:
            train_scheduler.step(epoch)

        ### train ###
        net.train()
        train_loss = 0.0  # cost function error
        train_correct = 0.0

        for i, data in enumerate(train_data):

            if epoch <= config.WARM:
                warmup_scheduler.step()

            length = len(train_data)
            image, label = data
            image, label = image.to(config.DEVICE), label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(
                'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'
                .format(train_loss / (i + 1),
                        train_correct / (i + 1),
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch,
                        trained_samples=i * config.BATCH_SIZE + len(image),
                        total_samples=len(train_data.dataset)))

        ### eval ###
        net.eval()
        test_loss = 0.0  # cost function error
        test_correct = 0.0

        for i, data in enumerate(test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)

            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)

            print(
                'Test set: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'
                .format(test_loss / (i + 1),
                        test_correct / (i + 1),
                        test_samples=i * config.BATCH_SIZE + len(images),
                        total_samples=len(test_data.dataset)))
        print()

        acc = test_correct / (i + 1)
        #start to save best performance model after learning rate decay to 0.01
        if epoch > config.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='best'))
            best_acc = acc
            continue

        if not epoch % config.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='regular'))
Example #5
        callbacks=[lr_decay, cb1],
        verbose=1)
else:
    datagen.fit(X_train)
    model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                        steps_per_epoch=len(X_train) // batch_size,
                        epochs=epoch,
                        validation_data=(X_test, y_test),
                        callbacks=[lr_decay, cb1],
                        verbose=1)

# make confusion matrix
preds_train = np.argmax(model.predict(X_train), axis=1)
preds_test = np.argmax(model.predict(X_test), axis=1)

acc_train = get_acc(y=np.argmax(y_train, axis=1), preds=preds_train)
acc_test = get_acc(y=np.argmax(y_test, axis=1), preds=preds_test)

save_f_train = 'data/{}/conf_mat/train_{}.png'.format(
    args.dataset, args.model)
save_f_test = 'data/{}/conf_mat/test_{}.png'.format(
    args.dataset, args.model)

make_confusion_matrix(
    y_row=np.argmax(y_train, axis=1),
    y_col=preds_train,
    save_file_name=save_f_train,
    dataset=args.dataset,
    title='acc_train : {:.3f}'.format(acc_train)
)
make_confusion_matrix(
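Here get_acc is called with keyword arguments on integer label arrays rather than on logits as in the PyTorch examples above. A minimal sketch consistent with this usage, assuming it returns the fraction of matching labels:

import numpy as np

def get_acc(y, preds):
    # fraction of samples whose predicted class matches the true class
    return float(np.mean(y == preds))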
Example #6
def main(mode=None):
    time_now = datetime.now().isoformat()
    config = load_config(mode)
    
    # random seeds
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)
    
    # load the training and test sets
    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)
    length_train = len(train_set)
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)

    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    length_test = len(test_set)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=True)
    
    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose network
    net = VGG16().to(config.DEVICE)
    print('The Model is VGG16\n')  
    
    # use tensorboardx
    if not os.path.exists(config.LOG_DIR):
        os.mkdir(config.LOG_DIR)
    writer = SummaryWriter(log_dir=os.path.join(
            config.LOG_DIR, time_now))

    # optimizer and loss function
    optimizer = optim.SGD(net.parameters(), lr=config.LR, momentum=0.9,
                          weight_decay=5e-4)
    loss_function = nn.CrossEntropyLoss()

    # warmup
    train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=config.MILESTONES,
                                                     gamma=0.5)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)
                 
    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH,'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path,'{epoch}-{type}.pth')
                 
    best_acc = 0.0

    for epoch in range(1, config.EPOCH + 1):

        if epoch > config.WARM:
            train_scheduler.step(epoch)
    
        ### train ###
        net.train()   
        train_loss = 0.0 # cost function error
        train_correct = 0.0

        for i, data in enumerate(train_data):

            if epoch <= config.WARM:
                warmup_scheduler.step()

            length = len(train_data)
            image, label = data
            image, label = image.to(config.DEVICE),label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # log the gradient norms of the final layer's weights and bias
            last_layer = list(net.children())[-1]
            n_iter = (epoch - 1) * iter_per_epoch + i + 1
            for name, para in last_layer.named_parameters():
                if 'weight' in name:
                    writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter)
                if 'bias' in name:
                    writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter)
            
            print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'.format(
                train_loss/(i+1),
                train_correct/(i+1),
                optimizer.param_groups[0]['lr'],
                epoch=epoch,
                trained_samples=i * config.BATCH_SIZE + len(image),
                total_samples=length_train
            ))
            writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'], n_iter)
            writer.add_scalar('Train/loss', train_loss / (i + 1), n_iter)
            writer.add_scalar('Train/acc', train_correct / (i + 1), n_iter)
        
        ### eval ###
        if epoch % 1 == 0:  # always true as written; raise the modulus to eval every N epochs
#             net.eval()
#             test_loss = 0.0    
#             test_correct = 0.0

#             for i, data in enumerate(test_data):
#                 images, labels = data
#                 images, labels = images.to(config.DEVICE),labels.to(config.DEVICE)

#                 outputs = net(images)
#                 loss = loss_function(outputs, labels)
#                 test_loss += loss.item()
#                 test_correct += get_acc(outputs, labels)

#                 print('Testing: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'.format(
#                 test_loss /(i+1),
#                 test_correct / (i+1),
#                 test_samples=i * config.BATCH_SIZE + len(images),
#                 total_samples=length_test))

#             writer.add_scalar('Test/Average loss', (test_loss/(i+1)), n_iter)
#             writer.add_scalar('Test/Accuracy', (test_correct/(i+1)), n_iter)
#             print()

            #start to save best performance model 
#             acc = test_correct/(i+1)  
#             if epoch > config.MILESTONES[1] and best_acc < acc:
#                 torch.save(net.state_dict(), checkpoint_path.format(epoch=epoch, type='best'))
#                 best_acc = acc
#                 continue

            if not epoch % config.SAVE_EPOCH:
                torch.save(net.state_dict(), checkpoint_path.format(epoch=epoch, type='regular'))
    writer.close()
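To reuse one of the checkpoints these loops write, the state dict can be loaded back into the same architecture. A minimal sketch (the epoch number and 'regular' tag are example values following the checkpoint_path pattern above):

state = torch.load(checkpoint_path.format(epoch=10, type='regular'),
                   map_location=config.DEVICE)
net.load_state_dict(state)
net.eval()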
Example #7
def do_train(
    cfg,
    model,
    metric_fc,
    train_loader,
    val_loader,
    optimizer,
    lr_schedule,
    loss_fn,
    loss_fn2,
    logger,
):
    output_dir = cfg.OUTPUT_DIR
    device = cfg.MODEL.DEVICE
    epochs = cfg.SOLVER.MAX_EPOCHS

    lfw_test_list = cfg.LFW_TEST_LIST

    # device_ids = [0, 1, 2, 3]
    # model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.to(device)
    if metric_fc is not None:
        metric_fc.to(device)
    map_dict = read_pkl()
    for epoch in range(epochs):
        lr_schedule.step()  # advance the epoch-level LR schedule
        model.train()

        # reset the running loss and accuracy counters
        train_loss = 0
        train_loss1 = 0
        train_loss2 = 0
        train_acc = 0
        label_mse_tensor = torch.tensor([])
        for step, (images, targets) in enumerate(tqdm(train_loader), 1):
            images = images.to(device)
            targets = targets.to(device)

            # read PEDCC weights

            tensor_empty = torch.Tensor([]).to(device)
            for target_index in targets:
                tensor_empty = torch.cat(
                    (tensor_empty,
                     map_dict[target_index.item()].float().to(device)), 0)

            label_mse_tensor = tensor_empty.view(-1, 512)
            label_mse_tensor = label_mse_tensor.to(
                device)  # PEDCC of each class
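            # (Equivalent, more compact gather of the same centroids:
            #  label_mse_tensor = torch.stack(
            #      [map_dict[t.item()].float() for t in targets]).to(device))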

            # forward
            output_ = model(images)
            output = output_[0]
            t_loss1 = loss_fn(output, targets)  # PEDCC-AMSOFTMAX
            t_loss2 = loss_fn2(output_[1], label_mse_tensor)
            t_loss2 = t_loss2**cfg.METRIC.N
            t_loss = t_loss1 + t_loss2

            # backward
            optimizer.zero_grad()
            t_loss.backward()
            optimizer.step()

            train_loss += t_loss.item()
            train_loss1 += t_loss1.item()
            train_loss2 += t_loss2.item()  # track loss1 and loss2 separately during training
            train_acc += get_acc(output_[0], targets)

        valid_loss = 0
        valid_acc = 0
        if val_loader is not None and cfg.DATASETS.NAME == "CIFAR100":
            model = model.eval()
            with torch.no_grad():
                for images, targets in val_loader:
                    images = images.to(device)
                    targets = targets.to(device)

                    output_ = model(images)
                    # v_loss = metric_fc(feature, targets)
                    v_loss = loss_fn(
                        output_[0],
                        targets)  # Only amsoftmax loss is considered here
                    valid_loss += v_loss.item()
                    valid_acc += get_acc(output_[0], targets)

            avg_t_loss = train_loss / len(train_loader)
            avg_t1_loss = train_loss1 / len(train_loader)
            avg_t2_loss = train_loss2 / len(train_loader)
            avg_v_loss = valid_loss / len(val_loader)
            avg_train_acc = train_acc / len(train_loader)
            avg_val_acc = valid_acc / len(val_loader)
            lr = lr_schedule.get_lr()[0]
            epoch_str = f"Epoch {epoch}: Train Loss1: {avg_t1_loss}, Train Loss2: {avg_t2_loss}, Train Loss: {avg_t_loss}, " \
                f"Train Acc: {avg_train_acc}, Valid Loss: {avg_v_loss}, Valid Acc: {avg_val_acc}, LR: {lr} "
            logger.info(epoch_str)
        elif cfg.DATASETS.NAME == "FACE_DATA":
            pass

        torch.save(model.state_dict(), f"{output_dir}/model.pth")  # save once per epoch
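map_dict comes from a read_pkl helper that is not shown; given how it is indexed (map_dict[target_index.item()] yields a 512-d PEDCC centroid per class), a hypothetical sketch:

import pickle

def read_pkl(path="pedcc_centroids.pkl"):  # file name is an assumption
    # load a pickled dict mapping class index -> 512-d centroid tensor
    with open(path, "rb") as f:
        return pickle.load(f)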