Example #1
def parse_train_segmentation_arguments(argv):
    description = 'Training a network for the task of image segmentation.'
    parser = ah.common_arg_parser(description=description)

    argument_groups.get_optimisation_group(parser)
    argument_groups.get_parallelisation_group(parser)
    argument_groups.get_augmentation_group(parser)

    args = _check_training_args(parser, argv)

    args.test_only = False
    args.target_size = args.target_size[0]
    if args.lr is None:
        args.lr = 0.02
    if args.weight_decay is None:
        args.weight_decay = 1e-4
    if '-e' not in argv and '--epochs' not in argv:
        args.epochs = 30
    # FIXME: can't take more than one GPU
    args.gpus = args.gpus[0]
    # TODO: why is load_weights False?
    args.out_dir = prepare_training.prepare_output_directories(
        dataset_name=args.dataset,
        network_name=args.network_name,
        optimiser='sgd',
        load_weights=False,
        experiment_name=args.experiment_name,
        framework='pytorch')

    return args
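
All of the examples on this page revolve around the same call to prepare_training.prepare_output_directories. The sketch below isolates that call pattern; the import path and the keyword values are assumptions for illustration, while the keyword names and the returned output-directory path come from the examples themselves.

from kernelphysiology.dl.pytorch.utils import prepare_training  # import path assumed

# Returns the output-directory path for a run; all keyword values here are placeholders.
out_dir = prepare_training.prepare_output_directories(
    dataset_name='imagenet',
    network_name='resnet50',
    optimiser='sgd',
    load_weights=False,
    experiment_name='demo_run',
    framework='pytorch')
print(out_dir)
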
Example #2
def main(argv):
    args = argument_handler.train_arg_parser(argv)
    args.lr, args.weight_decay = default_configs.optimisation_params(
        'classification', args)
    # FIXME: can't take more than one GPU
    args.gpus = args.gpus[0]

    if 'random_' in args.dataset:
        args.dataset = args.dataset.replace('random_', '')
        args.random_labels = True
    else:
        args.random_labels = False

    # TODO: why is load_weights False?
    args.out_dir = prepare_training.prepare_output_directories(
        dataset_name=args.dataset,
        network_name=args.network_name,
        optimiser='sgd',
        load_weights=False,
        experiment_name=args.experiment_name,
        framework='pytorch')

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn(
            'You have chosen to seed training. '
            'This will turn on the CUDNN deterministic setting, '
            'which can slow down your training considerably! '
            'You may see unexpected behavior when restarting from checkpoints.'
        )

    if args.gpus is not None:
        warnings.warn(
            'You have chosen a specific GPU. This will completely disable data parallelism.'
        )

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    json_file_name = os.path.join(args.out_dir, 'args.json')
    with open(json_file_name, 'w') as fp:
        json.dump(dict(args._get_kwargs()), fp, sort_keys=True, indent=4)

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(ngpus_per_node, args)
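
For context on the mp.spawn call above: torch.multiprocessing.spawn prepends the process index to the tuple given via args, so the spawned function receives that index as its first positional argument. A minimal self-contained sketch of that behaviour (the worker name and printed message are illustrative only):

import torch.multiprocessing as mp


def demo_worker(proc_idx, ngpus_per_node, msg):
    # proc_idx is injected by mp.spawn; the remaining arguments come from args=(...)
    print('worker %d of %d: %s' % (proc_idx, ngpus_per_node, msg))


if __name__ == '__main__':
    # launches two processes, each receiving its index as the first argument
    mp.spawn(demo_worker, nprocs=2, args=(2, 'hello'))
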
Example #3
def main(argv):
    args = argument_handler.train_arg_parser(argv, extra_args_fun)
    if args.prediction:
        from kernelphysiology.dl.utils import augmentation
        from kernelphysiology.dl.utils import arguments as ah
        args.manipulation, args.parameters = ah.create_manipulation_list(
            args.manipulation, args.parameters,
            augmentation.get_testing_augmentations())

    if args.lr is None:
        args.lr = 0.1
    if args.weight_decay is None:
        args.weight_decay = 1e-4
    # FIXME: can't take more than one GPU
    args.gpus = args.gpus[0]

    # TODO: why is load_weights False?
    args.out_dir = prepare_training.prepare_output_directories(
        dataset_name=args.dataset,
        network_name=args.network_name,
        optimiser='sgd',
        load_weights=False,
        experiment_name=args.experiment_name,
        framework='pytorch')

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn(
            'You have chosen to seed training. '
            'This will turn on the CUDNN deterministic setting, '
            'which can slow down your training considerably! '
            'You may see unexpected behavior when restarting from checkpoints.'
        )

    if args.gpus is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(ngpus_per_node, args)
Example #4
def main_worker(ngpus_per_node, args):
    mean, std = model_utils.get_preprocessing_function(args.colour_space,
                                                       args.vision_type)

    if args.gpus is not None:
        print("Use GPU: {} for training".format(args.gpus))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + args.gpus
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.transfer_weights is not None:
        print('Transferred model!')
        (model, _) = model_utils.which_network(args.transfer_weights[0],
                                               args.task_type,
                                               num_classes=args.old_classes)
        which_layer = -1
        if len(args.transfer_weights) == 2:
            which_layer = args.transfer_weights[1]
        model = model_utils.NewClassificationModel(model, which_layer,
                                                   args.num_classes)
    elif args.custom_arch:
        print('Custom model!')
        supported_customs = ['resnet_basic_custom', 'resnet_bottleneck_custom']
        if os.path.isfile(args.network_name):
            checkpoint = torch.load(args.network_name, map_location='cpu')
            customs = None
            if 'customs' in checkpoint:
                customs = checkpoint['customs']
                # TODO: num_classes is just for backward compatibility
                if 'num_classes' not in customs:
                    customs['num_classes'] = 1000
            model = which_architecture(checkpoint['arch'], customs,
                                       args.contrast_head)
            args.network_name = checkpoint['arch']

            model.load_state_dict(checkpoint['state_dict'], strict=False)
        elif args.network_name in supported_customs:
            model = custom_models.__dict__[args.network_name](
                args.blocks,
                contrast_head=args.contrast_head,
                pooling_type=args.pooling_type,
                in_chns=len(mean),
                num_classes=args.num_classes,
                inplanes=args.num_kernels,
                kernel_size=args.kernel_size)
    elif args.pretrained:
        print("=> using pre-trained model '{}'".format(args.network_name))
        model = models.__dict__[args.network_name](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.network_name))
        model = models.__dict__[args.network_name]()

    # TODO: why is load_weights False?
    args.out_dir = prepare_training.prepare_output_directories(
        dataset_name='contrast',
        network_name=args.network_name,
        optimiser='sgd',
        load_weights=False,
        experiment_name=args.experiment_name,
        framework='pytorch')
    # preparing the output folder
    create_dir(args.out_dir)
    json_file_name = os.path.join(args.out_dir, 'args.json')
    with open(json_file_name, 'w') as fp:
        json.dump(dict(args._get_kwargs()), fp, sort_keys=True, indent=4)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpus is not None:
            torch.cuda.set_device(args.gpus)
            model.cuda(args.gpus)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpus])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpus is not None:
        torch.cuda.set_device(args.gpus)
        model = model.cuda(args.gpus)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if (args.network_name.startswith('alexnet')
                or args.network_name.startswith('vgg')):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpus)

    # optimiser
    if args.transfer_weights is None:
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    else:
        # for p in model.features.parameters():
        #     p.requires_grad = False
        params_to_optimize = [
            {
                'params': [p for p in model.features.parameters()],
                'lr': 1e-6
            },
            {
                'params': [p for p in model.fc.parameters()]
            },
        ]
        optimizer = torch.optim.SGD(params_to_optimize,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    model_progress = []
    model_progress_path = os.path.join(args.out_dir, 'model_progress.csv')
    # optionally resume from a checkpoint
    # TODO: it would be best if resume loaded the architecture from this file
    # TODO: merge with which_architecture
    best_acc1 = 0
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.initial_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            if args.gpus is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpus)
                model = model.cuda(args.gpus)
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if os.path.exists(model_progress_path):
                model_progress = np.loadtxt(model_progress_path, delimiter=',')
                model_progress = model_progress.tolist()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    train_trans = []
    valid_trans = []
    both_trans = []
    if args.mosaic_pattern is not None:
        mosaic_trans = preprocessing.mosaic_transformation(args.mosaic_pattern)
        both_trans.append(mosaic_trans)

    if args.num_augmentations != 0:
        augmentations = preprocessing.random_augmentation(
            args.augmentation_settings, args.num_augmentations)
        train_trans.append(augmentations)

    target_size = default_configs.get_default_target_size(
        args.dataset, args.target_size)

    # loading the training set
    train_trans = [*both_trans, *train_trans]
    db_params = {
        'colour_space': args.colour_space,
        'vision_type': args.vision_type,
        'mask_image': args.mask_image
    }
    if args.dataset in ['imagenet', 'celeba', 'natural']:
        path_or_sample = args.data_dir
    else:
        path_or_sample = args.train_samples
    train_dataset = dataloader.train_set(args.dataset,
                                         target_size,
                                         mean,
                                         std,
                                         extra_transformation=train_trans,
                                         data_dir=path_or_sample,
                                         **db_params)
    if args.dataset == 'natural':
        train_dataset.num_crops = args.batch_size
        args.batch_size = 1

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # loading validation set
    valid_trans = [*both_trans, *valid_trans]
    validation_dataset = dataloader.validation_set(
        args.dataset,
        target_size,
        mean,
        std,
        extra_transformation=valid_trans,
        data_dir=path_or_sample,
        **db_params)
    if args.dataset == 'natural':
        validation_dataset.num_crops = train_dataset.num_crops
        args.batch_size = 1

    val_loader = torch.utils.data.DataLoader(validation_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # training loop over epochs
    for epoch in range(args.initial_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        misc_utils.adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train_log = train_on_data(train_loader, model, criterion, optimizer,
                                  epoch, args)

        # evaluate on validation set
        validation_log = validate_on_data(val_loader, model, criterion, args)

        model_progress.append([*train_log, *validation_log])

        # remember best acc@1 and save checkpoint
        acc1 = validation_log[2]
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if misc_utils.is_saving_node(args.multiprocessing_distributed,
                                     args.rank, ngpus_per_node):
            misc_utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.network_name,
                    'customs': {
                        'pooling_type': args.pooling_type,
                        'in_chns': len(mean),
                        'num_classes': args.num_classes,
                        'blocks': args.blocks,
                        'num_kernels': args.num_kernels,
                        'kernel_size': args.kernel_size
                    },
                    'preprocessing': {
                        'mean': mean,
                        'std': std
                    },
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                    'target_size': target_size,
                },
                is_best,
                out_folder=args.out_dir)
            # TODO: derive this header directly from the dictionary keys
            header = 'epoch,t_time,t_loss,t_top1,t_top5,v_time,v_loss,v_top1,v_top5'
            np.savetxt(model_progress_path,
                       np.array(model_progress),
                       delimiter=',',
                       header=header)
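
The checkpoint dictionary written by the loop above can be read back with plain torch.load. A minimal sketch, assuming a checkpoint file name of 'checkpoint.pth.tar' inside the output folder (the path components are placeholders; only the dictionary keys are taken from the example):

import os

import torch

ckpt_path = os.path.join('path/to/out_dir', 'checkpoint.pth.tar')  # placeholder path
checkpoint = torch.load(ckpt_path, map_location='cpu')

# keys stored by misc_utils.save_checkpoint in the example above
print(checkpoint['arch'], checkpoint['epoch'], checkpoint['best_acc1'])
# model.load_state_dict(checkpoint['state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer'])
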
Example #5
    start_time = datetime.datetime.fromtimestamp(start_stamp).strftime(
        '%Y-%m-%d_%H_%M_%S')
    print('Starting at: ' + start_time)

    args = arguments.common_train_arg_parser(sys.argv[1:])
    args = train_prominent_prepares(args)

    dataset_name = args.dataset.lower()
    network_name = args.network_name.lower()
    optimiser = args.optimiser.lower()

    # preparing directories
    args.save_dir = prepare_training.prepare_output_directories(
        dataset_name=dataset_name,
        network_name=network_name,
        optimiser=optimiser,
        load_weights=args.load_weights,
        experiment_name=args.experiment_name,
        framework='keras')

    # preparing the name of the model
    args.model_name = 'model_area_%s' % (args.area1layers)

    start_training_generator(args)

    finish_stamp = time.time()
    finish_time = datetime.datetime.fromtimestamp(finish_stamp).strftime(
        '%Y-%m-%d_%H-%M-%S')
    duration_time = (finish_stamp - start_stamp) / 60
    print('Finishing at: %s - Duration %.2f minutes.' %
          (finish_time, duration_time))
Example #6
def main(args):
    args.gpus = set_visible_gpus(args.gpus)
    args.device = prepare_device(args.gpus)

    args.architecture_kwargs = {
        'in_chns': args.in_chns
    }
    # TODO: this is not a nice solution here
    if 'wavenet' in args.architecture:
        args.architecture_kwargs['inplanes'] = args.num_kernels
        args.architecture_kwargs['planes'] = args.blocks

    # creating the model
    model, architecture, mean_std = geetup_net.which_network(
        args.architecture, **args.architecture_kwargs
    )
    model = model.to(args.device)

    args.out_dir = prepare_training.prepare_output_directories(
        dataset_name='geetup_' + args.dataset, network_name=architecture,
        optimiser='sgd', load_weights=False,
        experiment_name=args.experiment_name, framework='pytorch'
    )

    logging.basicConfig(
        filename=args.out_dir + '/experiment_info.log', filemode='w',
        format='%(levelname)s: %(message)s', level=logging.INFO
    )

    validation_pickle = _get_multi_sensory_paths(
        args.data_dir, args.validation_file
    )
    validation_dataset = geetup_db.get_validation_dataset(
        validation_pickle, args.target_size, mean_std
    )
    validation_loader = torch.utils.data.DataLoader(
        validation_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True
    )

    if args.random is not None:
        mean, std = get_preprocessing_function('rgb', 'trichromat')
        normalize_inverse = NormalizeInverse(mean, std)
        process_random_image(model, validation_loader, normalize_inverse, args)
        return

    args.criterion = nn.BCELoss().to(args.device)
    if args.evaluate:
        predict_outs = predict(
            validation_loader, model, args.criterion, args
        )
        for key, item in predict_outs.items():
            file_name = '_'.join(e for e in [key, *args.out_prefix])
            result_file = '%s/%s_%s' % (
                args.out_dir, file_name, args.validation_file
            )
            write_pickle(result_file, item)
        return

    training_pickle = _get_multi_sensory_paths(
        args.data_dir, args.train_file
    )
    train_dataset = geetup_db.get_train_dataset(
        training_pickle, args.target_size, mean_std, args.crop_scale,
        args.gaussian_sigma
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True
    )

    # optimiser
    optimizer = torch.optim.SGD(
        model.parameters(), args.lr,
        momentum=args.momentum, weight_decay=args.weight_decay
    )

    epochs(model, train_loader, validation_loader, optimizer, args)