Example No. 1
def main():
    torch.autograd.set_detect_anomaly(True)
    logger.info('Start to declare training variable')
    if torch.cuda.is_available():
        cfg.device = torch.device("cuda")
        torch.cuda.set_device(cfg.local_rank)
    else:
        cfg.device = torch.device("cpu")
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get data transforms
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_blur, cfg.tfm_means,
                                              cfg.tfm_stds,
                                              cfg.tfm_adaptive_thresholding)
    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        image_folder_wrapper=[['train']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
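    # e.g. STL-10 uses two heads: the first is trained on train+unlabeled plus
    # test, the second on the labeled train split plus test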
    test_splits = dict(cifar100=['train', 'test'],
                       image_folder_wrapper=['test'],
                       stl10=['train', 'test'])
    # instantiate a dataset for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare data loaders for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda, cfg.pica_target, cfg.pica_iic)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # move modules to target device
    # WORLD_SIZE is set by the distributed launcher; default to 1 for single-process runs
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if world_size > 1:
        dist.init_process_group(backend="nccl", init_method="env://")
    print("world size: {}".format(world_size))
    print("rank: {}".format(cfg.local_rank))
    synchronize()

    criterion = criterion.to(cfg.device)
    net = net.to(cfg.device)

    if world_size > 1:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
        net = torch.nn.parallel.DistributedDataParallel(
            net,
            device_ids=[cfg.local_rank],
            find_unused_parameters=True,
            output_device=cfg.local_rank).cuda()

    # Only rank 0 needs a SummaryWriter
    if cfg.local_rank == 0:
        # tensorboard writer
        writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)
    else:
        writer = None

    # start training
    lr = cfg.base_lr
    epoch = start_epoch

    logger.info('Start to evaluate after %d epochs of training' % epoch)
    acc = evaluate(net, test_loader, writer, epoch)

    if not cfg.debug and cfg.local_rank == 0:
        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint(
            {
                'net': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'acc': acc,
                'epoch': epoch
            },
            is_best=is_best)

    while lr > 0 and epoch < cfg.max_epochs:

        lr = lr_handler.update(epoch, optimizer)

        logger.info('Start to train epoch %d with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)

        epoch += 1

        logger.info('Start to evaluate after %d epochs of training' % epoch)
        acc = evaluate(net, test_loader, writer, epoch)

        if not cfg.debug and cfg.local_rank == 0:
            writer.add_scalar('Train/Learning_Rate', lr, epoch)
            # save checkpoint
            is_best = acc > best_acc
            best_acc = max(best_acc, acc)
            save_checkpoint(
                {
                    'net': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'acc': acc,
                    'epoch': epoch
                },
                is_best=is_best)

    logger.info('Done')
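
The checkpointing above relies on a save_checkpoint(state, is_best=...) helper defined elsewhere in the project. Its implementation is not shown on this page; the following is only a minimal sketch that matches the call sites (a state dict with 'net', 'optimizer', 'acc' and 'epoch' keys plus an is_best flag), using the usual torch.save/copyfile pattern, with file names chosen here purely for illustration.

import shutil

import torch


def save_checkpoint(state, is_best, ckpt_path='checkpoint.pth.tar',
                    best_path='model_best.pth.tar'):
    # Hypothetical sketch: write the latest training state to disk and,
    # when it is the best so far, keep a separate copy of it.
    torch.save(state, ckpt_path)
    if is_best:
        shutil.copyfile(ckpt_path, best_path)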
Example No. 2
def main():

    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    logger.info('Start to prepare data')
    # get data transforms
    # train_transform is for data perturbation
    train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
                                              cfg.tfm_means, cfg.tfm_stds)
    # get datasets
    # each head should have its own trainset
    train_splits = dict(cifar100=[['train', 'test']],
                        stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'], stl10=['train', 'test'])
    # instantiate a dataset for each head
    # otrainset: original trainset
    otrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=reduced_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # ptrainset: perturbed trainset
    ptrainset = [
        ConcatDataset([
            datasets.get(split=split, transform=train_transform)
            for split in train_splits[cfg.dataset][hidx]
        ]) for hidx in range(len(train_splits[cfg.dataset]))
    ]
    # testset
    testset = ConcatDataset([
        datasets.get(split=split, transform=test_transform)
        for split in test_splits[cfg.dataset]
    ])
    # declare data loaders for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = PUILoss(cfg.pica_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        net.load_state_dict(ckpt['net'])
        best_acc = ckpt['acc']
        start_epoch = ckpt['epoch']

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)
    # start training
    lr = cfg.base_lr
    epoch = start_epoch
    while lr > 0 and epoch < cfg.max_epochs:

        lr = lr_handler.update(epoch, optimizer)
        writer.add_scalar('Train/Learning_Rate', lr, epoch)

        logger.info('Start to train epoch %d with learning rate %.5f' %
                    (epoch, lr))
        train(epoch, net, otrainset, ptrainset, optimizer, criterion, writer)

        logger.info('Start to evaluate after %d epochs of training' % epoch)
        acc, nmi, ari = evaluate(net, test_loader)
        logger.info('Evaluation results at epoch %d are: '
                    'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)

        epoch += 1

        if cfg.debug:
            continue

        # save checkpoint
        is_best = acc > best_acc
        best_acc = max(best_acc, acc)
        save_checkpoint(
            {
                'net': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'acc': acc,
                'epoch': epoch
            },
            is_best=is_best)

    logger.info('Done')
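
The examples above obtain the "reduced" transform for the original (unperturbed) training images from a project-level get_reduced_transform helper (Example 1 also passes blur and adaptive-thresholding options). Its body is not shown here; a plausible torchvision sketch of the simpler variant used in Example 2 (an assumption, not the project's code) would be:

from torchvision import transforms as T


def get_reduced_transform(resize, size, means, stds):
    # Hypothetical sketch: deterministic preprocessing with no perturbation,
    # mirroring the arguments passed above (resize, crop size, normalisation stats).
    return T.Compose([
        T.Resize(resize),
        T.CenterCrop(size),
        T.ToTensor(),
        T.Normalize(mean=means, std=stds),
    ])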
Example No. 3
def main():

    logger.info('Start to declare training variable')
    cfg.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info('Session will be run on device: [%s]' % cfg.device)
    start_epoch = 0
    best_acc = 0.

    if cfg.pica:
        logger.info('Running in PICA mode')

    logger.info('Start to prepare data')
    # get data transforms
    # train_transform is for data perturbation
    #train_transform = transforms.get(train=True)
    # test_transform is for evaluation
    test_transform = transforms.get(train=False)
    # reduced_transform is for original training data
    #reduced_transform = get_reduced_transform(cfg.tfm_resize, cfg.tfm_size,
    #                                            cfg.tfm_means, cfg.tfm_stds)
    # get datasets
    # each head should have its own trainset
    #train_splits = dict(cifar100=[['train', 'test']], cifar10=[['train', 'test']],
    #    stl10=[['train+unlabeled', 'test'], ['train', 'test']])
    test_splits = dict(cifar100=['train', 'test'],
                       cifar10=['train', 'test'],
                       stl10=['train', 'test'])
    # instantiate a dataset for each head
    if cfg.dataset.startswith('stl') or cfg.dataset.startswith('cifar'):
        # otrainset: original trainset
        # otrainset = [ConcatDataset([datasets.get(split=split, transform=reduced_transform)
        #                 for split in train_splits[cfg.dataset][hidx]])
        #                 for hidx in xrange(len(train_splits[cfg.dataset]))]
        # # ptrainset: perturbed trainset
        # ptrainset = [ConcatDataset([datasets.get(split=split, transform=train_transform)
        #                 for split in train_splits[cfg.dataset][hidx]])
        #                 for hidx in xrange(len(train_splits[cfg.dataset]))]
        # testset
        testset = ConcatDataset([
            datasets.get(split=split, transform=test_transform)
            for split in test_splits[cfg.dataset]
        ])
    else:
        #    otrainset = [ImageFolder(root = cfg.data_root, transform = reduced_transform) for hidx in xrange(2)]
        #    ptrainset = [ImageFolder(root = cfg.data_root, transform = train_transform) for hidx in xrange(2)]
        testset = ImageFolder(root=cfg.data_root, transform=test_transform)
        logger.debug(
            'Dataset [%s] from directory [%s] is declared and %d samples '
            'are loaded' % (cfg.dataset, cfg.data_root, len(testset)))
    # declare data loaders for testset only
    test_loader = DataLoader(testset,
                             batch_size=cfg.batch_size,
                             shuffle=False,
                             num_workers=cfg.num_workers)

    logger.info('Start to build model')
    net = networks.get()
    criterion = DCLoss(cfg.dc_lamda)
    optimizer = optimizers.get(
        params=[val for _, val in net.trainable_parameters().items()])
    lr_handler = lr_policy.get()

    # load session if checkpoint is provided
    if cfg.resume:
        assert os.path.exists(cfg.resume), "Resume file not found"
        ckpt = torch.load(cfg.resume)
        logger.info('Start to resume session for file: [%s]' % cfg.resume)
        if not cfg.pica:
            net.load_state_dict(ckpt['net'])
            best_acc = ckpt['acc']
            start_epoch = ckpt['epoch']
        else:
            net.load_state_dict(ckpt)
            best_acc = 0
            start_epoch = 0

    # data parallel
    if cfg.device == 'cuda' and len(cfg.gpus.split(',')) > 1:
        logger.info('Data parallel will be used for acceleration purpose')
        device_ids = range(len(cfg.gpus.split(',')))
        if not (hasattr(net, 'data_parallel')
                and net.data_parallel(device_ids)):
            net = nn.DataParallel(net, device_ids=device_ids)
        cudnn.benchmark = True
    else:
        logger.info('Data parallel will not be used for acceleration')

    # move modules to target device
    net, criterion = net.to(cfg.device), criterion.to(cfg.device)

    # tensorboard writer
    writer = SummaryWriter(cfg.debug, log_dir=cfg.tfb_dir)
    # start training
    lr = cfg.base_lr
    epoch = start_epoch

    logger.info('Start to evaluate after %d epochs of training' % epoch)
    acc, nmi, ari = evaluate(net, test_loader)
    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    writer.add_scalar('Evaluate/ACC', acc, epoch)
    writer.add_scalar('Evaluate/NMI', nmi, epoch)
    writer.add_scalar('Evaluate/ARI', ari, epoch)

    logger.info('Done')
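
Examples 2 and 3 report clustering accuracy (ACC), normalised mutual information (NMI) and adjusted Rand index (ARI) from an evaluate function defined elsewhere in the project. As a reference only, these three metrics are conventionally computed from predicted cluster ids and ground-truth labels as in the sketch below (assuming both are integer numpy arrays of equal length; the helper name is chosen here for illustration).

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score


def clustering_metrics(y_true, y_pred):
    # Hypothetical sketch of the ACC/NMI/ARI triple reported by evaluate().
    # ACC uses the best one-to-one cluster-to-class mapping (Hungarian algorithm),
    # assuming cluster ids and class labels both range over 0..k-1.
    n = max(y_pred.max(), y_true.max()) + 1
    cost = np.zeros((n, n), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        cost[p, t] += 1
    row, col = linear_sum_assignment(cost.max() - cost)
    acc = cost[row, col].sum() / y_pred.size
    nmi = normalized_mutual_info_score(y_true, y_pred)
    ari = adjusted_rand_score(y_true, y_pred)
    return acc, nmi, ari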