Example #1
import argparse
import os

def train():
    conf = Config()
    # Print the model configuration
    conf.dump()
    parser = argparse.ArgumentParser(description='Train an image classification model')
    parser.add_argument(
        '--resume_checkpoint', action='store', type=str, default='model/checkpoint.pth',
        help='Resume the model from a checkpoint and continue training. If '
             '--resume_checkpoint is given, --arch, --learning_rate, '
             '--hidden_units, and --drop_p are ignored.')
    args = parser.parse_args()

    # Load the data
    dataloaders, class_to_idx = load_data(conf.data_directory)

    # Resume from the checkpoint if the model file exists
    if args.resume_checkpoint and os.path.exists(args.resume_checkpoint):
        # Load the checkpoint
        print('resume_checkpoint exists, loading the model')
        model, optimizer, epoch, history = load_checkpoint(
            checkpoint_path=args.resume_checkpoint,
            load_optimizer=True, gpu=conf.cuda)
        start_epoch = epoch + 1
    else:
        # Create a new model and optimizer
        print('resume_checkpoint not set or model file missing, creating a new model')
        model = create_model(
            arch=conf.arch, class_to_idx=class_to_idx,
            hidden_units=conf.hidden_units, drop_p=conf.dropout)
        optimizer = create_optimizer(model=model, lr=conf.learning_rate)
        start_epoch = 1
        history = None

    # Train the model
    history, best_epoch = train_model(
        dataloaders=dataloaders, model=model,
        optimizer=optimizer, gpu=conf.cuda, start_epoch=start_epoch,
        epochs=conf.epochs, train_history=history)

    # Evaluate the model on the test set
    test_acc = test_model(dataloader=dataloaders['test'], model=model, gpu=conf.cuda)
    print(f'Test set accuracy: {(test_acc * 100):.2f}%')

    # Save the model
    save_checkpoint(
        save_path=conf.save_path + conf.save_name, epoch=best_epoch, model=model,
        optimizer=optimizer, history=history)

    # Plot the training history
    plot_history(history)
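The checkpoint helpers used above are not shown. A minimal sketch of what such a pair might look like, assuming a single torch checkpoint dict with 'model', 'optimizer_state', 'epoch', and 'history' keys (the key names and optimizer class are assumptions, not confirmed by the snippet):

import torch

def save_checkpoint(save_path, epoch, model, optimizer, history):
    # Assumed layout: one dict holding everything needed to resume training
    torch.save({
        'epoch': epoch,
        'model': model,
        'optimizer_state': optimizer.state_dict(),
        'history': history,
    }, save_path)

def load_checkpoint(checkpoint_path, load_optimizer=True, gpu=False):
    device = 'cuda' if gpu else 'cpu'
    ckpt = torch.load(checkpoint_path, map_location=device)
    model = ckpt['model']
    optimizer = None
    if load_optimizer:
        # Hypothetical: rebuild the optimizer and restore its state
        optimizer = torch.optim.Adam(model.parameters())
        optimizer.load_state_dict(ckpt['optimizer_state'])
    return model, optimizer, ckpt['epoch'], ckpt['history']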
Example #2
    # train_recalls.append(avg_train_recall)
    print("Train loss: ", avg_train_loss)
    print("Train recall: ", avg_train_recall)
    writer.add_scalar("Loss/train", avg_train_loss, ep)
    writer.add_scalar("Recall/train", avg_train_recall, ep)

    avg_val_loss, avg_val_recall = model_utils.validate_model(
        rec_sys_model, loss_func, valid_loader, ep, top_k, val_display_step)
    # val_losses.append(avg_val_loss)
    # val_recalls.append(avg_val_recall)
    print("Val loss: ", avg_val_loss)
    print("Val recall: ", avg_val_recall)
    writer.add_scalar("Loss/val", avg_val_loss, ep)
    writer.add_scalar("Recall/val", avg_val_recall, ep)

    avg_test_loss, avg_test_recall = model_utils.test_model(
        rec_sys_model, loss_func, test_loader, ep, top_k, test_display_step)
    # test_losses.append(avg_test_loss)
    # test_recalls.append(avg_test_recall)

    writer.add_scalar("Loss/test", avg_test_loss, ep)
    writer.add_scalar("Recall/test", avg_test_recall, ep)

    if avg_test_recall > recall_max:
        print('Test loss went from {:.6f} to {:.6f}'.format(
            loss_min, avg_test_loss))
        print('Test recall improved from {:.6f} to {:.6f}'.format(
            recall_max, avg_test_recall))
        print('Saving best model')
        # check_point.save_ckpt(checkpoint, True, model_name, checkpoint_dir, best_model_dir, ep)
        check_point.save_config_param(best_model_dir, model_name, config_param)
        torch.save(rec_sys_model, best_model_dir + model_name + '.pt')
        # Update the best metrics so later epochs compare against them
        loss_min = avg_test_loss
        recall_max = avg_test_recall
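This fragment runs inside an epoch loop whose setup is not shown. A minimal sketch of the assumed surroundings (the SummaryWriter, the best-metric trackers, and the model_utils.train_model signature are assumptions inferred from how the fragment uses them):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/rec_sys')  # hypothetical log directory
loss_min = float('inf')      # best test loss seen so far
recall_max = float('-inf')   # best test recall seen so far

for ep in range(num_epochs):
    # Hypothetical signature, mirroring validate_model/test_model above
    avg_train_loss, avg_train_recall = model_utils.train_model(
        rec_sys_model, loss_func, optimizer, train_loader, ep,
        top_k, train_display_step)
    # ... the logging/checkpoint fragment above executes here ...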
Example #3
def run(rank, model, train_pics, train_bsz):
    workers = [int(v) for v in str(args.learners).split('-')]
    # list.append returns None, so build the group list by concatenation
    _group = workers + [rank]
    group = dist.new_group(_group)

    # Broadcast the initial parameters to all workers via scatter
    for p in model.parameters():
        scatter_p_list = [p.data for _ in range(len(workers) + 1)]
        dist.scatter(tensor=p.data, scatter_list=scatter_p_list, group=group)

    print('Model sent successfully!')

    print('Begin!')

    transform = transforms.Compose([
        transforms.Resize(128),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Dummy iterable with one entry per training batch: the PS never touches
    # real data, it only synchronizes parameters once per batch
    tmp = [
        (0, 0)
        for _ in range(int(math.ceil(train_pics / (len(workers) * train_bsz))))
    ]

    pre_time = datetime.datetime.now()
    for epoch in range(args.epochs):
        for batch_idx, (_, _) in enumerate(tmp):
            for param in model.parameters():
                tensor = torch.zeros_like(param.data)

                # FIXME FIXED: every tensor in gather_list must be a fresh
                # object, otherwise the gather misbehaves
                gather_list = [
                    torch.zeros_like(param.data)
                    for _ in range(len(workers) + 1)
                ]
                dist.gather(tensor=tensor,
                            gather_list=gather_list,
                            group=group)
                # The PS's own slot is zeros, so the sum covers workers only
                tensor = sum(gather_list) / len(workers)
                param.data -= tensor
                scatter_list = [param.data for _ in range(len(workers) + 1)]
                dist.scatter(tensor=tensor,
                             scatter_list=scatter_list,
                             group=group)

            print('Done {}/{}!'.format(batch_idx, len(tmp)))
        print('Done Epoch {}/{}!'.format(epoch + 1, args.epochs))

    end_time = datetime.datetime.now()
    # Evaluate the accuracy of the PS-side model
    h, remainder = divmod((end_time - pre_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    time_str = "Time %02d:%02d:%02d" % (h, m, s)

    test_dataset = datasets.CIFAR10(args.data_dir,
                                    train=False,
                                    download=False,
                                    transform=transform)
    criterion = torch.nn.CrossEntropyLoss()
    test_data = DataLoader(test_dataset, batch_size=128, shuffle=True)

    test_loss, acc = test_model(dist.get_rank(),
                                model,
                                test_data,
                                criterion=criterion)
    print('total time ' + str(time_str))
    with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
        f.write('Rank: ' + str(rank) + ', \tEpoch: ' + str(args.epochs) +
                ', \tTestLoss: ' + str(test_loss) + ', \tTestAcc: ' + str(acc) +
                ', \tTotalTime: ' + str(time_str) + '\n')
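run() assumes torch.distributed is already initialized and that args.learners lists the worker ranks. A minimal launch sketch for the parameter-server process (the backend, address, and dataset size are assumptions):

import torch.distributed as dist

# Hypothetical launch; 50000 is the CIFAR-10 train-set size
dist.init_process_group(backend='gloo',
                        init_method='tcp://127.0.0.1:23456',
                        rank=0,
                        world_size=len(str(args.learners).split('-')) + 1)
run(rank=0, model=model, train_pics=50000, train_bsz=128)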
Example #4
def run(rank, workers, model, save_path, train_data, test_data):
    # Receive the initial model parameters from the parameter server (rank 0)
    # list.append returns None, so build the group list by concatenation
    _group = workers + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model received successfully!')

    optimizer = MySGD(model.parameters(), lr=0.01, momentum=0.5)
    criterion = torch.nn.CrossEntropyLoss()
    print('Begin!')

    for epoch in range(args.epochs):
        pre_time = datetime.datetime.now()
        model.train()

        # For AlexNet, decay the learning rate at the specified epochs
        if args.model == 'AlexNet':
            if epoch + 1 in [40, 60]:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            delta_ws = optimizer.get_delta_w()
            # Synchronize with the parameter server: send local updates,
            # receive the averaged weights back
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=delta_ws[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'.format(
                rank, epoch, batch_idx, len(train_data), loss.data.item()))

        end_time = datetime.datetime.now()
        h, remainder = divmod((end_time - pre_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)

        epoch_train_loss /= len(train_data)
        epoch_train_loss = format(epoch_train_loss, '.4f')

        # Evaluate on the test set after each epoch
        test_loss, acc = test_model(rank,
                                    model,
                                    test_data,
                                    criterion=criterion)
        print('total time ' + str(time_str))
        with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
            f.write('Rank: ' + str(rank) + ', \tEpoch: ' + str(epoch + 1) +
                    ', \tTrainLoss: ' + str(epoch_train_loss) + ', \tTestLoss: ' +
                    str(test_loss) + ', \tTestAcc: ' + str(acc) + ', \tTime: ' +
                    str(time_str) + '\n')

        if (epoch + 1) % 5 == 0:
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(
                model.state_dict(),
                save_path + '/' + args.model + '_' + str(epoch + 1) + '.pkl')
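MySGD is a custom optimizer whose get_delta_w() returns the update each parameter would receive instead of applying it in place. A minimal sketch under that assumption (momentum bookkeeping omitted for brevity; the real implementation is not shown):

from torch.optim import SGD

class MySGD(SGD):
    def get_delta_w(self):
        # Return lr * grad for every parameter that has a gradient,
        # in the same order as model.parameters()
        delta_ws = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                delta_ws.append(group['lr'] * p.grad.data.clone())
        return delta_ws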
Example #5
def run(rank, workers, model, save_path, train_data, test_data):
    # Build the two process groups used for parameter synchronization
    level_0 = [int(v) for v in str(args.level_0).split('-')]
    level_1 = [int(v) for v in str(args.level_1).split('-')]

    _level_0_group = list(level_0)
    level_0_group = dist.new_group(_level_0_group)

    _level_1_group = list(level_1)
    level_1_group = dist.new_group(_level_1_group)

    optimizer = MySGD(model.parameters(), lr=0.01, momentum=0.5)
    criterion = torch.nn.CrossEntropyLoss()
    print('The model was successfully initialized!')
    print('Begin!')

    for epoch in range(args.epochs):
        pre_time = datetime.datetime.now()
        model.train()

        # For AlexNet, decay the learning rate at the specified epochs
        if args.model == 'AlexNet':
            if epoch + 1 in [40, 60]:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        epoch_train_acc = 0
        for batch_idx, (data, target) in enumerate(train_data):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            delta_ws = optimizer.get_delta_w()

            # Synchronize: split each update in two chunks and let two
            # threads reduce them concurrently, one per process group
            for idx, param in enumerate(model.parameters()):
                global split_tensor
                split_tensor = list(torch.chunk(delta_ws[idx], 2, 0))
                thread1 = MyThread(0, len(_level_0_group), level_0_group)
                thread2 = MyThread(1, len(_level_1_group), level_1_group)
                thread1.start()
                thread2.start()
                thread1.join()
                thread2.join()

                thread1 = MyThread(0, len(_level_1_group), level_1_group)
                thread2 = MyThread(1, len(_level_0_group), level_0_group)
                thread1.start()
                thread2.start()
                thread1.join()
                thread2.join()
                param.data -= torch.cat((split_tensor[0], split_tensor[1]), 0)

            epoch_train_loss += loss.data.item()
            epoch_train_acc += get_acc(output, target)
            print('Rank {}, Epoch {}, Batch {}/{}, Loss:{}'
                  .format(rank, epoch, batch_idx, len(train_data), loss.data.item()))

        end_time = datetime.datetime.now()
        h, remainder = divmod((end_time-pre_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)

        epoch_train_loss /= len(train_data)
        epoch_train_loss = format(epoch_train_loss, '.4f')

        # Evaluate on the test set after each epoch
        test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        print('total time ' + str(time_str))
        with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
            f.write('Rank: ' + str(rank) +
                    ', \tEpoch: ' + str(epoch + 1) +
                    ', \tTrainLoss: ' + str(epoch_train_loss) +
                    ', \tTrainAcc: ' + str(epoch_train_acc / len(train_data)) +
                    ', \tTestLoss: ' + str(test_loss) +
                    ', \tTestAcc: ' + str(acc) +
                    ', \tTime: ' + str(time_str) + '\n')

        if (epoch + 1) % 5 == 0:
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(model.state_dict(),
                       save_path + '/' + args.model + '_' + str(epoch + 1) + '.pkl')
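MyThread is not defined in this snippet; from its use it appears to reduce one chunk of the global split_tensor within a process group. A minimal sketch under that assumption (the constructor arguments mirror how it is called above):

import threading
import torch.distributed as dist

class MyThread(threading.Thread):
    def __init__(self, chunk_idx, group_size, group):
        super().__init__()
        self.chunk_idx = chunk_idx
        self.group_size = group_size
        self.group = group

    def run(self):
        # Hypothetical behavior: sum one chunk across the group's
        # members, then average it in place
        dist.all_reduce(split_tensor[self.chunk_idx], group=self.group)
        split_tensor[self.chunk_idx] /= self.group_size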
Example #6
if pre_tr_model == 'vgg16':
    input_units = model.classifier[0].in_features
    model.name = 'vgg16'
elif pre_tr_model == 'vgg19':
    input_units = model.classifier[0].in_features
    model.name = 'vgg19'
elif pre_tr_model == 'densenet':
    input_units = model.classifier.in_features
    model.name = 'densenet'
elif pre_tr_model == 'alexnet':
    input_units = model.classifier[1].in_features
    model.name = 'alexnet'

# Build the model's classifier head
model = build_classifier(model, input_units, hidden_units, dropout)
print(model)

# Set criterion and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)
model.to(device)

# Training model
model = train_model(model, epochs, trainloader, validloader, criterion,
                    optimizer, device)

# Testing model
test_model(model, testloader, device)

# Saving model
save_model(model, train_data, save_dir)
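build_classifier is defined elsewhere; a minimal sketch, assuming a single hidden layer ending in LogSoftmax to match the NLLLoss criterion (the 102-class default is hypothetical, matching a flower-classification dataset):

from torch import nn

def build_classifier(model, input_units, hidden_units, dropout, output_units=102):
    # Freeze the pretrained feature extractor
    for param in model.parameters():
        param.requires_grad = False
    # Replace the classifier head with a fresh, trainable one
    model.classifier = nn.Sequential(
        nn.Linear(input_units, hidden_units),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_units, output_units),
        nn.LogSoftmax(dim=1),
    )
    return model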
Example #7
from model_utils import train_model, test_model
from cnn_model_with_output_conv import CNNModel

model = CNNModel()

train_model(model, "cnn_model_with_output_conv", 20000)
test_model(model, "cnn_model_with_output_conv")
Example #8
def test_model(model, ds_config):
    datasets = ds_config_to_datasets(ds_config)
    model_utils.test_model(model, datasets["test"], MIN_CLASS_LABEL,
                           MAX_CLASS_LABEL, "muh_confusion.png")
Example #9
#constants
#output_cats = 102  # number of flower classifications (can make this a command line input for other training)

args = get_args_train()

if args.device == 'gpu' and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # Fall back to CPU so that `device` is always defined
    device = torch.device('cpu')
    print("Model should be trained on GPU; enable it and pass --gpu gpu for training")

train_data, test_data, validation_data, trainloader, testloader, validationloader = load_data(
    args.data_directory)

pretrain_model, arch_inFeatures = pretrained_model(args.arch)

model, criterion = create_classifier(pretrain_model, arch_inFeatures,
                                     args.hidden_units, args.output_cats)

optimizer = optim.Adam(model.classifier.parameters(), lr=args.lr)

trained_model = train_model(model, args.epochs, trainloader, validationloader,
                            device, optimizer, criterion)

tested_model = test_model(trained_model, testloader, device, optimizer,
                          criterion)

save_checkpoint(trained_model, args.save_directory, args.arch, train_data,
                optimizer, args.epochs, args.hidden_units)
Example #10
flat_dim = get_flat_dim(INPUT_DIM, N_CONV, CONV_FILTERS, K_SIZES, P_KERNELS,
                        STRIDES, P_STRIDES, PADDINGS)
model = ConvNet(N_CONV, N_POOL, N_FC, CONV_FILTERS, K_SIZES, P_KERNELS,
                STRIDES, P_STRIDES, PADDINGS, FC_DIMS, N_MLP, MLP_DIMS,
                BATCH_TILE, INPUT_DIM[0], flat_dim, DEVICE).to(DEVICE)

# --- IHC pretrained weights ---
if os.path.exists(cnn_file):
    weights = torch.load(cnn_file, map_location=DEVICE)
    model.load_state_dict(weights['state_dict'], strict=False)
    model.to(DEVICE)

else:
    print('WARNING: model file does not exist!')

if mode == 'inference':
    predictions = inference(DEVICE,
                            model,
                            AGGREGATION,
                            data_path,
                            BATCH_TILE=BATCH_TILE)

elif mode == 'test':
    criterion = nn.CrossEntropyLoss()
    ACC, F1, precision, recall = test_model(DEVICE,
                                            model,
                                            AGGREGATION,
                                            criterion,
                                            data_path,
                                            BATCH_TILE=BATCH_TILE)
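A short follow-up inside the test branch that reports the metrics test_model returns (the format string is illustrative):

    print(f'Acc: {ACC:.4f} | F1: {F1:.4f} | '
          f'Precision: {precision:.4f} | Recall: {recall:.4f}')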