def train(save_path, model, lr_splitting_by=None, lrs=None, wd=0, lr=0.1,
          batch_size=128, n_epochs=100, weights=None, fb_method=False,
          callbacks=None, optimizer='sgd', scheduler=None,
          freeze_all_but_this_layer=None, mode='train'):
    """Build a model, optimizer and scheduler, then run training or evaluation.

    Args:
        save_path: Directory passed to the training/evaluation loop for outputs.
        model: Name of a model constructor looked up in ``models.__dict__``.
        lr_splitting_by: Optional layer-name split spec; when given, the
            optimizer is rebuilt by ``create_optimizer`` with per-group ``lrs``.
        lrs: Per-group learning rates used together with ``lr_splitting_by``.
        wd: Weight decay for SGD/Adam.
        lr: Base learning rate.
        batch_size: Batch size forwarded to the data-generator factory.
        n_epochs: Number of epochs (also the cosine schedule horizon).
        weights: Optional class-weight list; selects ``target_indice``.
        fb_method: If True, ``target_indice`` is the index of weight 1 (or 0).
        callbacks: Iterable of callback names resolved via ``get_callback``.
        optimizer: 'sgd' or 'adam' (replaced by the constructed optimizer).
        scheduler: 'cosine' to enable cosine annealing, else passed through.
        freeze_all_but_this_layer: Optional name prefix; all parameters are
            frozen except those whose name starts with this prefix.
        mode: 'train' runs ``training_loop``; anything else runs
            ``evaluation_loop`` on the test split.
    """
    # Fix: the original signature used `callbacks=[]`, a mutable default
    # argument shared across calls; use None as the sentinel instead.
    if callbacks is None:
        callbacks = []

    # Create dynamically dataset generators.
    # Renamed from `train/valid/test` to avoid shadowing this function's name.
    train_data, valid_data, test_data, meta_data = get_chexnet_covid(
        batch_size=batch_size)

    # Create dynamically model
    model = models.__dict__[model]()
    summary(model)

    loss_function = torch.nn.BCELoss()

    if freeze_all_but_this_layer is not None:
        # First freeze all layers
        logger.info("Freezing all layers")
        for parameter in model.parameters():
            parameter.requires_grad = False
        # Unfreeze every parameter whose name matches the given prefix
        for name, parameter in model.named_parameters():
            if name.startswith(freeze_all_but_this_layer):
                parameter.requires_grad = True
                logger.info("Unfreezing {}: {}".format(name, parameter.shape))

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    if scheduler == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, n_epochs)

    if lr_splitting_by is not None:
        # NOTE(review): this replaces the optimizer AFTER a 'cosine' scheduler
        # may have been bound to the previous one, so the scheduler would keep
        # stepping the old optimizer — confirm this ordering is intended.
        optimizer, _ = create_optimizer(optimizer, model, lr_splitting_by, lrs)

    # Create dynamically callbacks
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            print(name)
            callbacks_constructed.append(clbk)

    # Pass everything to the training loop
    steps_per_epoch = len(train_data) if train_data is not None else None

    target_indice = None
    if fb_method:
        target_indice = weights.index(1) if 1 in weights else 0
    elif weights is not None:
        target_indice = 0

    if mode == 'train':
        assert train_data is not None, "please provide train data"
        assert valid_data is not None, "please provide validation data"
        training_loop(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            train=train_data,
            valid=valid_data,
            test=test_data,
            meta_data=meta_data,
            steps_per_epoch=steps_per_epoch,
            n_epochs=n_epochs,
            save_path=save_path,
            config=_CONFIG,
            use_tb=True,
            custom_callbacks=callbacks_constructed,
            fb_method=fb_method,
            target_indice=target_indice,
        )
    else:
        assert test_data is not None, "please provide test data for evaluation"
        evaluation_loop(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            test=test_data,
            meta_data=meta_data,
            save_path=save_path,
            config=_CONFIG,
            custom_callbacks=callbacks_constructed,
            target_indice=target_indice,
        )
def main():
    """Single-process training entry point for a PanNuke segmentation model.

    Parses CLI args, builds model/optimizer/scheduler, loads the fold arrays
    from ``.npy`` files, assembles Catalyst ``loaders``, writes the run config
    under a timestamped output directory, and hands everything to
    ``SupervisedRunner.train``.
    """
    setup_default_logging()
    args, args_text = _parse_args()

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    _logger.info('Training with a single process on %d GPUs.' % args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    # prepare model
    model = create_model(args.model,
                         args.encoder,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         checkpoint_path=args.initial_checkpoint)

    # prepare optimizer
    optimizer = create_optimizer(args, model)

    # prepare scheduler
    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    _logger.info('Scheduled epochs: {}'.format(num_epochs))

    # prepare dataset: folds are stored as pre-extracted numpy arrays
    folder = args.data_folder
    train_fold = args.train_fold
    images = np.load(f'{folder}/images/fold{train_fold}_images.npy')
    masks = np.load(f'{folder}/masks/fold{train_fold}_masks.npy')
    types = np.load(f'{folder}/types/fold{train_fold}_types.npy')

    valid_fold = args.valid_fold
    images_val = np.load(f'{folder}/images/fold{valid_fold}_images.npy')
    masks_val = np.load(f'{folder}/masks/fold{valid_fold}_masks.npy')
    types_val = np.load(f'{folder}/types/fold{valid_fold}_types.npy')

    if args.no_aug:
        train_dataset = PanNukeDataset(images, masks, types,
                                       get_valid_transforms())
    else:
        # NOTE: `get_training_trasnforms` is the project helper's (misspelled)
        # real name — do not "fix" it here without renaming the definition.
        train_dataset = PanNukeDataset(images, masks, types,
                                       get_training_trasnforms(args.aug_type))
    val_dataset = PanNukeDataset(images_val, masks_val, types_val,
                                 get_valid_transforms())

    loaders = {
        'train': DataLoader(train_dataset,
                            batch_size=args.batch_size,
                            num_workers=args.workers,
                            pin_memory=True,
                            shuffle=True),
        'valid': DataLoader(val_dataset,
                            batch_size=args.batch_size *
                            args.validation_batch_size_multiplier,
                            num_workers=args.workers,
                            pin_memory=True,
                            shuffle=False),
    }

    # save config (fix: removed dead `output_dir = ''` pre-assignment)
    output_base = args.output if args.output else './logs'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        args.model,
        args.encoder,
        args.aug_type,
        args.opt.lower(),
    ])
    output_dir = get_outdir(output_base, 'train', exp_name)
    with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
        f.write(args_text)

    criterion, criterion_names = create_criterion(args)
    callbacks = create_callbacks(args, criterion_names)

    eval_metric = args.eval_metric
    # Fix: redundant `True if ... else False` ternary.
    minimize_metric = eval_metric == 'loss'

    runner = SupervisedRunner(input_key=args.input_key,
                              input_target_key=args.input_target_key)

    # set fp16
    if args.fp16:
        fp16_params = dict(opt_level="O1")  # params for FP16
        _logger.info('Using fp16 O1')
    else:
        fp16_params = None
        _logger.info('Not using fp16 O1')

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=lr_scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=output_dir,
        num_epochs=num_epochs,
        main_metric=eval_metric,
        minimize_metric=minimize_metric,
        verbose=True,
        fp16=fp16_params,
    )
def main():
    """Config-driven training entry point with optional fp16/apex AMP support.

    Parses a JSON-style config, saves it, seeds RNGs, builds the model,
    optimizer and scheduler, then runs the per-epoch train loop with
    per-epoch reseeding and log saving.
    """
    # parse command line argument and generate config dictionary
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # Code for saving in the correct place: flatten all config sections so
    # `save_name` can be a format string over any config key.
    all_arguments = {}
    for key in config.keys():
        all_arguments.update(config[key])
    run_config['save_name'] = run_config['save_name'].format(**all_arguments)
    print('Saving in ' + run_config['save_name'])
    # End code for saving in the right place

    # `test_config` means: just validate the config and exit.
    if run_config['test_config']:
        sys.exit(0)

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter(run_config['outdir'])
    else:
        writer = None

    # create output directory
    outdir = pathlib.Path(run_config['outdir'])
    outdir.mkdir(exist_ok=True, parents=True)

    # save config as json file in output directory
    outpath = outdir / 'config.json'
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # load data loaders
    train_loader, test_loader = get_loader(config['data_config'])

    # set random seed (this was moved after the data loading because the data
    # loader might have a random seed)
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # One pre-drawn seed per epoch so each epoch's RNG state is reproducible.
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=optim_config['epochs'])

    # load model
    logger.info('Loading model...')
    model = utils.load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))

    if run_config['count_params']:
        # this option means just count the number of parameters, then move on
        sys.exit(0)

    # Manual half-precision: keep BatchNorm layers in fp32 for stability.
    if run_config['fp16'] and not run_config['use_amp']:
        model.half()
        for layer in model.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()

    device = torch.device(run_config['device'])
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logger.info('Done')

    train_criterion, test_criterion = utils.get_criterion(
        config['data_config'])

    # create optimizer
    if optim_config['no_weight_decay_on_bn']:
        # Two param groups: the second ('bn' in the parameter name) gets
        # weight_decay=0; the first inherits the configured decay.
        params = [
            {
                'params': [
                    param
                    for name, param in model.named_parameters()
                    if 'bn' not in name
                ]
            },
            {
                'params': [
                    param
                    for name, param in model.named_parameters()
                    if 'bn' in name
                ],
                'weight_decay': 0
            },
        ]
    else:
        params = model.parameters()
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = utils.create_optimizer(params, optim_config)

    # for mixed-precision
    amp_handle = apex.amp.init(
        enabled=run_config['use_amp']) if is_apex_available else None

    # run test before start training
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    # Checkpoint-style state record; fields other than 'config' are filled
    # in (presumably) elsewhere — only initialized here.
    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    epoch_logs = []
    for epoch, seed in zip(range(1, optim_config['epochs'] + 1), epoch_seeds):
        # Reseed numpy each epoch from the pre-drawn seed list.
        np.random.seed(seed)
        # train
        train_log = train(epoch, model, optimizer, scheduler, train_criterion,
                          train_loader, config, writer, amp_handle)

        epoch_log = train_log.copy()
        epoch_logs.append(epoch_log)
        utils.save_epoch_logs(epoch_logs, outdir)

        """ Upload to bucket code """
        # NOTE(review): this uploads the whole outdir to GCS every epoch, with
        # function-local imports — presumably ad-hoc experiment plumbing.
        from google.cloud import storage
        import os
        client = storage.Client()
        bucket = client.get_bucket('ramasesh-bucket-1')
        filenames = os.listdir(outdir)
        for filename in filenames:
            print('Processing file: ' + filename)
            blob = bucket.blob(run_config['save_name'] + filename)
            blob.upload_from_filename(str(outdir) + '/' + filename)
        """
def main(argv):
    """Train and evaluate a model, reporting per-epoch logs and periodic
    model measurements through prefixed reporters.

    Args:
        argv: Remaining non-flag command-line arguments (printed in debug
            mode only).
    """
    config = parse_args()
    logging.info(json.dumps(config, indent=2))

    if FLAGS.debug:
        print('non-flag arguments:', argv)
        return

    reporters.save_config(config)

    # set up reporting: one reporter per prefix, all writing to data_store
    data_store = {}
    reporter = reporters.build_reporters(config['save_config'], data_store)
    prefixes = ['test', 'train', 'model_measurements']
    reporter = reporters.prefix_reporters(reporter, prefixes)

    loaders = get_loader(config['data_config'])
    train_loader, test_loader, single_train_loader, single_test_loader = loaders
    config['optim_config']['steps_per_epoch'] = len(train_loader)

    train_setup.set_reproducible(seed=config['run_config']['seed'])

    logging.info('Loading model...')
    model = utils.load_model(config['model_config'])
    # Snapshot the initial weights so later measurements can compare
    # against them (deep copies, detached from future training updates).
    initial_parameters = copy.deepcopy(list(model.parameters()))
    initial_named_parameters = copy.deepcopy(list(model.named_parameters()))

    device = torch.device(config['run_config']['device'])
    if device.type == 'cuda' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logging.info('Done')

    train_criterion = nn.CrossEntropyLoss(reduction='mean')
    test_criterion = nn.CrossEntropyLoss(reduction='mean')

    # run test before start training; epoch -1 marks the initial evaluation
    epoch = -1
    logging.info('Initial evaluation')
    test_log = test(model, test_criterion, test_loader, config['run_config'])
    reporter['test'].report_all(epoch, test_log)

    model_dicts = {}

    optimizer, scheduler = utils.create_optimizer(model.parameters(),
                                                  config['optim_config'])

    logging.info('Beginning training')
    for epoch in range(config['optim_config']['epochs']):
        train_log = train(model, optimizer, scheduler, train_criterion,
                          train_loader, config['run_config'])
        # Fix: f'train'/f'test' were f-strings with no placeholders.
        reporter['train'].report_all(epoch, train_log)
        test_log = test(model, test_criterion, test_loader,
                        config['run_config'])
        reporter['test'].report_all(epoch, test_log)

        if should_measure(epoch, config):
            model_measurements = make_measurements(model, config, loaders,
                                                   initial_parameters,
                                                   initial_named_parameters)
            reporter['model_measurements'].report_all(epoch,
                                                      model_measurements)

    # NOTE(review): placed after the loop (save once at the end); the
    # flattened source does not show the original indentation — confirm.
    # `model_dicts` is never populated in this function as visible here.
    reporters.save_dict(config, model_dicts, 'model_parameters')