Example #1
def train(save_path,
          model,
          lr_splitting_by=None,
          lrs=None,
          wd=0,
          lr=0.1,
          batch_size=128,
          n_epochs=100,
          weights=None,
          fb_method=False,
          callbacks=(),  # immutable default instead of a shared mutable []
          optimizer='sgd',
          scheduler=None,
          freeze_all_but_this_layer=None,
          mode='train'):
    # Dynamically create the dataset generators
    train, valid, test, meta_data = get_chexnet_covid(batch_size=batch_size)

    # Dynamically create the model
    model = models.__dict__[model]()
    summary(model)

    loss_function = torch.nn.BCELoss()

    if freeze_all_but_this_layer is not None:
        # First freeze all layers
        logger.info("Freezing all layers")
        for parameter in model.parameters():
            parameter.requires_grad = False

        # Unfreeze layers whose names match the given prefix
        for name, parameter in model.named_parameters():
            if name.startswith(freeze_all_but_this_layer):
                parameter.requires_grad = True
                logger.info("Unfreezing {}: {}".format(name, parameter.shape))

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     weight_decay=wd)

    if scheduler == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, n_epochs)

    if lr_splitting_by is not None:
        optimizer, _ = create_optimizer(optimizer, model, lr_splitting_by, lrs)

    # Dynamically create the callbacks
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            print(name)
            callbacks_constructed.append(clbk)

    # Pass everything to the training loop
    if train is not None:
        steps_per_epoch = len(train)
    else:
        steps_per_epoch = None

    target_indice = None
    if fb_method:
        target_indice = weights.index(1) if 1 in weights else 0
    elif weights is not None:
        target_indice = 0

    if mode == 'train':
        assert train is not None, "please provide train data"
        assert valid is not None, "please provide validation data"
        training_loop(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            train=train,
            valid=valid,
            test=test,
            meta_data=meta_data,
            steps_per_epoch=steps_per_epoch,
            n_epochs=n_epochs,
            save_path=save_path,
            config=_CONFIG,
            use_tb=True,
            custom_callbacks=callbacks_constructed,
            fb_method=fb_method,
            target_indice=target_indice,
        )
    else:
        assert test is not None, "please provide test data for evaluation"
        evaluation_loop(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            test=test,
            meta_data=meta_data,
            save_path=save_path,
            config=_CONFIG,
            custom_callbacks=callbacks_constructed,
            target_indice=target_indice,
        )
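
A hypothetical invocation of the train function above, assuming it lives in a module together with the helpers it references (get_chexnet_covid, get_callback, training_loop, evaluation_loop, _CONFIG). The model name, callback name, and hyperparameter values below are illustrative only and are not taken from the original source.

# Minimal usage sketch; 'densenet121' is assumed to be a key in models.__dict__,
# and 'early_stopping' is assumed to be resolvable by get_callback().
if __name__ == '__main__':
    train(
        save_path='./results/covid_run',
        model='densenet121',
        lr=0.01,
        wd=1e-4,
        batch_size=64,
        n_epochs=50,
        optimizer='adam',
        scheduler='cosine',
        callbacks=['early_stopping'],
        mode='train',
    )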
Example #2
def main():
    setup_default_logging()
    args, args_text = _parse_args()
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    _logger.info('Training with a single process on %d GPUs.' % args.num_gpu)
    torch.manual_seed(args.seed + args.rank)

    # prepare model
    model = create_model(args.model,
                         args.encoder,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         checkpoint_path=args.initial_checkpoint)

    # prepare optimizer
    optimizer = create_optimizer(args, model)

    # prepare scheduler
    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    _logger.info('Scheduled epochs: {}'.format(num_epochs))

    # prepare dataset
    folder = args.data_folder
    train_fold = args.train_fold
    images = np.load(f'{folder}/images/fold{train_fold}_images.npy')
    masks = np.load(f'{folder}/masks/fold{train_fold}_masks.npy')
    types = np.load(f'{folder}/types/fold{train_fold}_types.npy')

    valid_fold = args.valid_fold
    images_val = np.load(f'{folder}/images/fold{valid_fold}_images.npy')
    masks_val = np.load(f'{folder}/masks/fold{valid_fold}_masks.npy')
    types_val = np.load(f'{folder}/types/fold{valid_fold}_types.npy')
    if args.no_aug:
        train_dataset = PanNukeDataset(images, masks, types,
                                       get_valid_transforms())
    else:
        train_dataset = PanNukeDataset(images, masks, types,
                                       get_training_trasnforms(args.aug_type))
    val_dataset = PanNukeDataset(images_val, masks_val, types_val,
                                 get_valid_transforms())

    loaders = {
        'train':
        DataLoader(train_dataset,
                   batch_size=args.batch_size,
                   num_workers=args.workers,
                   pin_memory=True,
                   shuffle=True),
        'valid':
        DataLoader(val_dataset,
                   batch_size=args.batch_size *
                   args.validation_batch_size_multiplier,
                   num_workers=args.workers,
                   pin_memory=True,
                   shuffle=False)
    }

    # save config
    output_base = args.output if args.output else './logs'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), args.model, args.encoder,
        args.aug_type,
        args.opt.lower()
    ])
    output_dir = get_outdir(output_base, 'train', exp_name)

    with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
        f.write(args_text)

    criterion, criterion_names = create_criterion(args)
    callbacks = create_callbacks(args, criterion_names)
    eval_metric = args.eval_metric
    minimize_metric = eval_metric == 'loss'
    runner = SupervisedRunner(input_key=args.input_key,
                              input_target_key=args.input_target_key)
    # set fp16
    if args.fp16:
        fp16_params = dict(opt_level="O1")  # params for FP16
        _logger.info('Using fp16 O1')
    else:
        fp16_params = None
        _logger.info('Not using fp16 O1')
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=lr_scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=output_dir,
        num_epochs=num_epochs,
        main_metric=eval_metric,
        minimize_metric=minimize_metric,
        verbose=True,
        fp16=fp16_params,
    )
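
PanNukeDataset is not defined in the snippet above. Below is a minimal sketch of what such a dataset class might look like, assuming albumentations-style transforms (called with named image/mask arguments) and a dict-style sample as Catalyst's SupervisedRunner expects; the field names and return format are assumptions, not taken from the original source.

import torch
from torch.utils.data import Dataset


class PanNukeDatasetSketch(Dataset):
    # Hypothetical stand-in for the PanNukeDataset used above.
    def __init__(self, images, masks, types, transforms=None):
        self.images = images          # (N, H, W, 3) image array
        self.masks = masks            # (N, H, W, C) segmentation masks
        self.types = types            # (N,) tissue-type labels
        self.transforms = transforms  # albumentations-style Compose, or None

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image, mask = self.images[idx], self.masks[idx]
        if self.transforms is not None:
            augmented = self.transforms(image=image, mask=mask)
            image, mask = augmented['image'], augmented['mask']
        return {
            'features': torch.as_tensor(image, dtype=torch.float32),
            'targets': torch.as_tensor(mask, dtype=torch.float32),
            'tissue_type': self.types[idx],
        }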
Example #3
def main():
    # parse the command-line arguments and generate the config dictionary
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # Build the save name from the full set of config arguments
    all_arguments = {}
    for key in config.keys():
        all_arguments.update(config[key])

    run_config['save_name'] = run_config['save_name'].format(**all_arguments)
    print('Saving in ' + run_config['save_name'])
    # End of save-name construction

    if run_config['test_config']:
        sys.exit(0)

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter(run_config['outdir'])
    else:
        writer = None

    # create output directory
    outdir = pathlib.Path(run_config['outdir'])
    outdir.mkdir(exist_ok=True, parents=True)

    # save config as json file in output directory
    outpath = outdir / 'config.json'
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # load data loaders
    train_loader, test_loader = get_loader(config['data_config'])

    # set random seed (this was moved after the data loading because the data
    # loader might have a random seed)
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=optim_config['epochs'])

    # load model
    logger.info('Loading model...')
    model = utils.load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))

    if run_config['count_params']:
        # this option only counts the parameters and then exits
        sys.exit(0)

    if run_config['fp16'] and not run_config['use_amp']:
        model.half()
        for layer in model.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()

    device = torch.device(run_config['device'])
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logger.info('Done')

    train_criterion, test_criterion = utils.get_criterion(
        config['data_config'])

    # create optimizer
    if optim_config['no_weight_decay_on_bn']:
        params = [
            {
                'params': [
                    param for name, param in model.named_parameters()
                    if 'bn' not in name
                ]
            },
            {
                'params': [
                    param for name, param in model.named_parameters()
                    if 'bn' in name
                ],
                'weight_decay': 0,
            },
        ]
    else:
        params = model.parameters()
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = utils.create_optimizer(params, optim_config)

    # for mixed-precision
    amp_handle = apex.amp.init(
        enabled=run_config['use_amp']) if is_apex_available else None

    # run evaluation once before training starts
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    epoch_logs = []
    for epoch, seed in zip(range(1, optim_config['epochs'] + 1), epoch_seeds):
        np.random.seed(seed)
        # train
        train_log = train(epoch, model, optimizer, scheduler, train_criterion,
                          train_loader, config, writer, amp_handle)

        epoch_log = train_log.copy()
        epoch_logs.append(epoch_log)
        utils.save_epoch_logs(epoch_logs, outdir)
    """
    Upload to bucket code
    """

    from google.cloud import storage
    import os

    client = storage.Client()
    bucket = client.get_bucket('ramasesh-bucket-1')
    filenames = os.listdir(outdir)

    for filename in filenames:
        print('Processing file: ' + filename)

        blob = bucket.blob(run_config['save_name'] + filename)
        blob.upload_from_filename(str(outdir) + '/' + filename)
    """
Example #4
def main(argv):
    config = parse_args()
    logging.info(json.dumps(config, indent=2))

    if FLAGS.debug:
        print('non-flag arguments:', argv)
        return

    reporters.save_config(config)

    # set up reporting
    data_store = {}
    reporter = reporters.build_reporters(config['save_config'], data_store)
    prefixes = ['test', 'train', 'model_measurements']
    reporter = reporters.prefix_reporters(reporter, prefixes)

    loaders = get_loader(config['data_config'])
    train_loader, test_loader, single_train_loader, single_test_loader = loaders
    config['optim_config']['steps_per_epoch'] = len(train_loader)

    train_setup.set_reproducible(seed=config['run_config']['seed'])

    logging.info('Loading model...')
    model = utils.load_model(config['model_config'])
    initial_parameters = copy.deepcopy(list(model.parameters()))
    initial_named_parameters = copy.deepcopy(list(model.named_parameters()))
    device = torch.device(config['run_config']['device'])
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logging.info('Done')

    train_criterion = nn.CrossEntropyLoss(reduction='mean')
    test_criterion = nn.CrossEntropyLoss(reduction='mean')

    # run evaluation once before training starts
    epoch = -1
    logging.info('Initial evaluation')
    test_log = test(model, test_criterion, test_loader, config['run_config'])
    reporter['test'].report_all(epoch, test_log)

    model_dicts = {}

    optimizer, scheduler = utils.create_optimizer(model.parameters(),
                                                  config['optim_config'])

    logging.info('Beginning training')

    for epoch in range(config['optim_config']['epochs']):
        train_log = train(model, optimizer, scheduler, train_criterion,
                          train_loader, config['run_config'])
        reporter['train'].report_all(epoch, train_log)

        test_log = test(model, test_criterion, test_loader,
                        config['run_config'])
        reporter['test'].report_all(epoch, test_log)

        if should_measure(epoch, config):
            model_measurements = make_measurements(model, config, loaders,
                                                   initial_parameters,
                                                   initial_named_parameters)

            reporter['model_measurements'].report_all(epoch,
                                                      model_measurements)

    reporters.save_dict(config, model_dicts, 'model_parameters')
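
make_measurements and should_measure are not shown in this example. A minimal sketch of one plausible make_measurements follows, assuming it reports how far each weight tensor has drifted from the copies taken at initialization; the metric names and the drift definition are assumptions, not the original implementation.

import torch


def make_measurements_sketch(model, initial_named_parameters):
    # Hypothetical helper, not from the original source: per-tensor L2 distance
    # of the current weights from their values at initialization.
    initial = dict(initial_named_parameters)
    measurements = {}
    with torch.no_grad():
        for name, param in model.named_parameters():
            # Strip the 'module.' prefix added by nn.DataParallel, if present.
            key = name[len('module.'):] if name.startswith('module.') else name
            start = initial[key].to(param.device)
            measurements['l2_change/' + key] = (param - start).norm().item()
    return measurements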