Example #1
def train_model(training_loader):
    global epoch_list
    global learning_rate_list
    if train_from_last_model:
        model = load_model()
        learning_rate_list, epoch_list = load_info()
        print("Load the exist model and continue")
    else:
        model = SegNet(input_channel,output_channel).cuda()
        print("Train a new model")
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    for epoch in range(NUM_EPOCHS):
        t_start = time.time()
        # i = 0
        count_batch = 0    # counts batches for averaging the loss
        loss_sum = 0
        epoch_list.append(epoch+1)
        # x_axis_list = []   #graph
        # y_axis_list = []   #graph
        # tqdm.write('epoch = {}'.format(epoch+1))
        for i, batch in enumerate(tqdm(training_loader)):
            # load data
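            # note: Variable is a legacy no-op wrapper since PyTorch 0.4; plain CUDA tensors behave the same here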
            input_tensor = Variable(batch['camera_5']).cuda()
            target_tensor = Variable(batch['fg_mask']).cuda()
            predicted_tensor, softmaxed_tensor = model(input_tensor)
            
            criterion = torch.nn.CrossEntropyLoss().cuda()
            #criterion = nn.BCEWithLogitsLoss().cuda()
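            # note: CrossEntropyLoss is stateless, so the criterion could be built once before the epoch loop instead of per batch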
            
            optimizer.zero_grad()
            loss = criterion(predicted_tensor, target_tensor)
            #print('loss = {}'.format(loss))
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()  # .item() takes the scalar value so the autograd graph is not kept across batches
            count_batch += 1
            torch.cuda.empty_cache()  # note: emptying the cache every batch is not required and can slow training
        # tqdm().clear()
        average_loss = loss_sum / count_batch
        learning_rate_list.append(float(average_loss))  # despite the name, this list holds the per-epoch average loss for the learning curve
        tqdm.write('{} epoch: loss = {}'.format(epoch+1,average_loss))
        plot_learning_curve(len(epoch_list))
        save_model(model)
        save_info()
    return model
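
# Usage sketch (not part of the original example): camera_dataset stands for a hypothetical
# Dataset whose batches are dicts holding 'camera_5' image tensors and 'fg_mask' label tensors;
# the module-level names the function reads (train_from_last_model, input_channel,
# output_channel, LEARNING_RATE, NUM_EPOCHS, epoch_list, learning_rate_list) are assumed to be
# defined elsewhere in the file.
from torch.utils.data import DataLoader

training_loader = DataLoader(camera_dataset, batch_size=4, shuffle=True, num_workers=2)
trained_model = train_model(training_loader)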
Example #2
def main(_run, _config, world_size, rank, init_method, datadir, batch_size,
         num_workers, outdir_suffix, outdir, lr, wd, warmup, num_epochs,
         nsamples):
    cudnn.benchmark = True
    device = torch.device('cuda:0')  # device is set by CUDA_VISIBLE_DEVICES
    torch.cuda.set_device(device)

    # rank 0 creates experiment observer
    is_master = rank == 0

    # rank joins process group
    print('rank', rank, 'init_method', init_method)
    dist.init_process_group('nccl',
                            rank=rank,
                            world_size=world_size,
                            init_method=init_method)

    # actual training stuff
    train = make_loader(
        pt.join(datadir, '') if datadir else None,
        batch_size,
        device,
        world_size,
        rank,
        num_workers,
        # this parameter controls whether GPU-side augmentation is applied to the data
        gpu_augmentation=False,
        image_rng=None,
        nsamples=nsamples)

    # lr would be scaled linearly relative to the reference batch size of 256 (block disabled):
    # world_batch_size = world_size * batch_size
    # k = world_batch_size / 256
    # lr = k * lr

    # outdir stuff
    if outdir is None:
        outdir = pt.join(ROOT, '../exp/', outdir_suffix)

    model = Net(num_classes=500, batch_size=batch_size)
    print('\n network parameters ', len(list(model.parameters())))  # counts parameter tensors, not individual weights

    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    #model = Unpacker(model)

    optimizer, policy = make_policy(num_epochs, model, lr, 0.9, wd)

    # loss for autoencoder
    loss = L1Loss(output_key='output', target_key='target_image').to(device)
    # this loss is for classifier
    classifier_loss = CrossEntropyLoss(output_key='probs',
                                       target_key='label').to(device)
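    # the Trainer below receives both the reconstruction (L1) loss and the classification loss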
    trainer = Trainer(model,
                      optimizer,
                      loss,
                      classifier_loss,
                      rank,
                      AccuracyMetric(output_key='softmax_output',
                                     target_key='label'),
                      policy,
                      None,
                      train,
                      None,
                      outdir,
                      snapshot_interval=4 if is_master else None,
                      quiet=not is_master)

    print('\n Number of epochs:', num_epochs)
    start = datetime.now()
    with train:
        trainer.train(num_epochs, start_epoch=0)

    print("Training complete in: " + str(datetime.now() - start))
Example #3
def main():
    global args, best_prec1
    args = parser.parse_args()

    # Check if the save directory exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = SegNet(3, 3)

    #model.features = torch.nn.DataParallel(model.features)
    if use_gpu:
        model.cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # data_transforms = {
    #     'train': transforms.Compose([
    #         transforms.Scale(256),
    #         transforms.RandomSizedCrop(224),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
    #     ]),
    #     'val': transforms.Compose([
    #         transforms.Scale(256),
    #         transforms.CenterCrop(224),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
    #     ]),
    # }

    data_transforms = {
        'train':
        transforms.Compose([
            transforms.Resize((224, 224)),  # transforms.Scale was renamed to Resize in newer torchvision
            transforms.ToTensor(),
        ]),
        'val':
        transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]),
    }

    data_dir = '/media/salman/DATA/NUST/MS RIME/Thesis/MICCAI Dataset/miccai_all_images'

    image_datasets = {
        x: miccaiDataset(os.path.join(data_dir, x), data_transforms[x])
        for x in ['train', 'val']
    }

    dataloaders = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.workers)
        for x in ['train', 'val']
    }
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

    # Define loss function (criterion) and optimizer
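    # pixel-wise MSE between the model output and the target (SegNet(3, 3) maps 3-channel inputs to 3-channel outputs)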
    criterion = nn.MSELoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    #optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                            momentum=args.momentum,
    #                            weight_decay=args.weight_decay)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.evaluate:
        validate(dataloaders['val'], model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        #adjust_learning_rate(optimizer, epoch)

        # Train for one epoch
        train(dataloaders['train'], model, criterion, optimizer, epoch)

        # Evaluate on validation set
        prec1 = validate(dataloaders['val'], model, criterion)
        prec1 = prec1.cpu().data.numpy()

        # Remember the best (lowest) prec1 value and save a checkpoint
        print(prec1)
        print(best_prec1)
        is_best = prec1 < best_prec1
        best_prec1 = min(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                #'optimizer': optimizer.state_dict(),
            },
            is_best,
            filename=os.path.join(args.save_dir,
                                  'checkpoint_{}.tar'.format(epoch)))
Example #4
def handler(context):
    # Dataset
    dataset_alias = context.datasets
    train_dataset_id = dataset_alias['train']
    val_dataset_id = dataset_alias['val']

    trainset = SegmentationDatasetFromAPI(train_dataset_id,
                                          transform=SegNetAugmentation(MEANS))
    valset = SegmentationDatasetFromAPI(val_dataset_id,
                                        transform=SegNetAugmentation(
                                            MEANS, False))
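    # per-class weights computed from a non-augmented pass over the training set, used to weight the loss below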
    class_weight = calc_weight(
        SegmentationDatasetFromAPI(train_dataset_id,
                                   transform=SegNetAugmentation(MEANS, False)))
    class_weight = class_weight.to(device)

    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=BATCHSIZE,
                                              shuffle=True,
                                              num_workers=0)
    valloader = torch.utils.data.DataLoader(valset,
                                            batch_size=BATCHSIZE,
                                            shuffle=False,
                                            num_workers=0)

    # Model
    net = SegNet(3, n_class=len(camvid_label_names))
    net = net.to(device)

    # Optimizer
    #criterion = PixelwiseSoftmaxClassifier(weight=class_weight)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weight, ignore_index=-1)
    optimizer = optim.SGD(net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=5e-4)
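    # MultiStepLR drops the learning rate by a factor of 10 at epochs 150 and 250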
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[150, 250],
                                         gamma=0.1)

    statistics = Statistics(epochs)

    for epoch in range(epochs):
        train_loss, train_acc = train(net, optimizer, trainloader, criterion,
                                      epoch)
        # step the LR scheduler after the epoch's optimizer updates (the ordering PyTorch >= 1.1 expects)
        scheduler.step()

        test_loss, test_acc = test(net, valloader, criterion, epoch)

        # Reporting
        print(
            '[{:d}] main/loss: {:.3f} main/acc: {:.3f}, main/validation/loss: {:.3f}, main/validation/acc: {:.3f}'
            .format(epoch + 1, train_loss, train_acc, test_loss, test_acc))

        statistics(epoch + 1, train_loss, train_acc, test_loss, test_acc)
        writer.add_scalar('main/loss', train_loss, epoch + 1)
        writer.add_scalar('main/acc', train_acc, epoch + 1)
        writer.add_scalar('main/validation/loss', test_loss, epoch + 1)
        writer.add_scalar('main/validation/acc', test_acc, epoch + 1)

    torch.save(net.state_dict(),
               os.path.join(ABEJA_TRAINING_RESULT_DIR, 'model.pth'))
Example #5
File: main.py  Project: ninfueng/ninpy
        num_workers=hparams.num_workers,
        pin_memory=True,
    )

    model = SegNet(11, 3, drop_rate=0.2)
    writer.add_graph(model, torch.zeros(1, 3, 360, 480))
    model = model.to(device)
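    # per-class loss weights; the trailing entry is dropped so only the 11 trained classes are weighted (label 11 is ignored by the criterion)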
    cls_w = class_weights(train_dataset).astype(np.float32)
    cls_w = torch.from_numpy(cls_w[:-1])

    criterion = nn.CrossEntropyLoss(reduction="mean",
                                    ignore_index=11,
                                    weight=cls_w).to(device)
    # optimizer = optim.Adam(
    optimizer = optim.SGD(
        model.parameters(),
        lr=float(hparams.init_lr),
        weight_decay=float(hparams.weight_decay),
        momentum=0.9,
    )

    # wrap the model and optimizer for NVIDIA apex mixed-precision training at the requested opt_level
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=hparams.opt_lv,
                                      verbosity=1)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=hparams.step_size,
                                               gamma=hparams.step_down_rate)

    best_acc = 0.0
    pbar = tqdm(range(hparams.epochs))