def run_training(config):
    tb_writer = CustomWriter(config)
    logger = saver.get_logger(config)
    num_epochs = config.training.num_epochs

    model = {'G': models.get_model(config, tag='G'),
             'D': models.get_model(config, tag='D')}
    model = {key: torch.nn.DataParallel(value) for key, value in model.items()}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = get_DataLoader(config, phase="train")
    val_loader = get_DataLoader(config, phase="val")

    optimizer = {'G': optimizers.get_optimizer(model['G'].parameters(), config.model.G.optimizer),
                 'D': optimizers.get_optimizer(model['D'].parameters(), config.model.D.optimizer)}
    scheduler = {'G': schedulers.get_scheduler(optimizer['G'], config.model.G.scheduler),
                 'D': schedulers.get_scheduler(optimizer['D'], config.model.D.scheduler)}
    criterion = {'G': losses.get_loss(config.model.G.criterion),
                 'D': losses.get_loss(config.model.D.criterion)}

    start_epoch_num = (
        saver.get_latest_epoch_num(config)
        if config.model.G.load_state == -1 or config.model.G.load_state == "latest"
        else config.model.G.load_state
    )

    # Dynamic imports according to protocols
    epoch_module = importlib.import_module('protocols.{}.epoch'.format(config.protocol))
    train_one_epoch, test = getattr(epoch_module, 'train_one_epoch'), getattr(epoch_module, 'test')

    for epoch in range(start_epoch_num + 1, start_epoch_num + num_epochs + 1):
        train_buffer = train_one_epoch(
            config=config,
            model=model,
            device=device,
            train_loader=train_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            criterion=criterion,
            epoch=epoch,
            logger=logger,
            log_interval=config.training.log_interval,
        )
        tb_writer.write_result(train_buffer, epoch, phase="train")

        if epoch % config.training.validation_period == 0:
            val_buffer = test(
                config=config,
                model=model,
                device=device,
                test_loader=val_loader,
                criterion=criterion,
                logger=logger,
                phase="val",
                tag=epoch,
                log_interval=8,
            )
            tb_writer.write_result(val_buffer, epoch, phase="val")
def configure_optimizers(self):
    optim = get_optimizer(self.optim_params['name'])(
        self.parameters(), **self.optim_params['params'])
    if isinstance(self.sched_params, dict):
        sched = get_scheduler(self.sched_params['name'])(
            optim, **self.sched_params['params'])
    else:
        sched = []
    return [optim], sched
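# A minimal sketch, not this project's actual code, of the name-to-class
# registries that `get_optimizer` / `get_scheduler` in the snippet above appear
# to rely on (each returns a class that the caller then instantiates). The
# supported names and the specific torch.optim classes are illustrative
# assumptions.
import torch


def get_optimizer(name):
    """Return an optimizer class by name; instantiated by the caller."""
    registry = {
        'sgd': torch.optim.SGD,
        'adam': torch.optim.Adam,
        'adamw': torch.optim.AdamW,
    }
    return registry[name.lower()]


def get_scheduler(name):
    """Return a learning-rate scheduler class by name."""
    registry = {
        'step': torch.optim.lr_scheduler.StepLR,
        'cosine': torch.optim.lr_scheduler.CosineAnnealingLR,
    }
    return registry[name.lower()]


# Usage matching the snippet above, e.g.:
# optim = get_optimizer('adam')(model.parameters(), lr=1e-3)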
def get_optimizer(self, model):
    """Initialize the SCAFFOLD optimizer."""
    optimizer = optimizers.get_optimizer(model)
    optimizer.server_update_direction = self.server_update_direction
    optimizer.client_update_direction = self.client_update_direction
    optimizer.client_id = self.client_id
    optimizer.update_flag = True
    return optimizer
def setUp(self):
    super().setUp()
    __ = Config()

    fields = [
        'optimizer', 'lr_schedule', 'learning_rate', 'momentum',
        'weight_decay', 'lr_gamma', 'lr_milestone_steps', 'lr_warmup_steps'
    ]
    params = ['SGD', '', 0.1, 0.5, 0.0, 0.0, '', '']
    Config().trainer = namedtuple('trainer', fields)(*params)

    self.model = models_registry.get('resnet_18')
    self.optimizer = optimizers.get_optimizer(self.model)
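# A hedged sketch of the kind of dispatch `optimizers.get_optimizer(self.model)`
# in the test above presumably performs. The real function reads the global
# Config() singleton; here the trainer settings are passed explicitly so the
# sketch stays self-contained. Field names follow the namedtuple built in
# setUp(); the branch logic and supported names are illustrative assumptions.
import torch


def build_optimizer(model, trainer_cfg):
    """Create an optimizer for `model` from trainer settings like those in setUp()."""
    if trainer_cfg.optimizer == 'SGD':
        return torch.optim.SGD(model.parameters(),
                               lr=trainer_cfg.learning_rate,
                               momentum=trainer_cfg.momentum,
                               weight_decay=trainer_cfg.weight_decay)
    if trainer_cfg.optimizer == 'Adam':
        return torch.optim.Adam(model.parameters(),
                                lr=trainer_cfg.learning_rate,
                                weight_decay=trainer_cfg.weight_decay)
    raise ValueError(f'Unsupported optimizer: {trainer_cfg.optimizer}')


# Hypothetical usage: self.optimizer = build_optimizer(self.model, Config().trainer)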
def init_optimizer(self):
    """Initialize the optimizer."""
    optimizer_cls = get_optimizer(self.params[TRAIN])
    optimizer_params = {k: v for k, v in self.params[TRAIN][OPTIMIZER].items()
                        if k != "name"}
    # Include trainable criterion parameters (if any) alongside the model's.
    criterion_params = list(self.criterion.parameters())
    if criterion_params:  # a list is never None, so test for emptiness instead
        model_params = list(self.model.parameters()) + criterion_params
    else:
        model_params = self.model.parameters()
    optimizer = optimizer_cls(model_params, **optimizer_params)
    return optimizer
def get_optimizer(self, model):
    """Initialize the FedSarah optimizer."""
    optimizer = optimizers.get_optimizer(model)
    optimizer.server_control_variates = self.server_control_variates
    optimizer.client_control_variates = self.client_control_variates
    optimizer.client_id = self.client_id
    optimizer.max_counter = Config().trainer.epochs

    if self.adjustment:
        # Anneal epsilon upward from min_epsilon toward max_epsilon as the
        # federated learning round counter grows.
        optimizer.epsilon = optimizer.max_epsilon - (
            optimizer.max_epsilon - optimizer.min_epsilon) * np.exp(
                -1 * optimizer.epsilon_decay * self.fl_round_counter)
        # Alternative schedule (decays from max_epsilon toward min_epsilon):
        # optimizer.epsilon = optimizer.min_epsilon + (
        #     optimizer.max_epsilon - optimizer.min_epsilon) * np.exp(
        #         -1 * optimizer.epsilon_decay * self.fl_round_counter)
    else:
        optimizer.epsilon = optimizer.min_epsilon

    return optimizer
def main(): """Main function""" # Initialization args = parse_args() dist = init_workers(args.distributed) config = load_config(args) os.makedirs(config['output_dir'], exist_ok=True) config_logging(verbose=args.verbose) logging.info('Initialized rank %i size %i local_rank %i local_size %i', dist.rank, dist.size, dist.local_rank, dist.local_size) if dist.rank == 0: logging.info('Configuration: %s', config) # Setup MLPerf logging if args.mlperf: mllogger = configure_mllogger(config['output_dir']) if dist.rank == 0 and args.mlperf: mllogger.event(key=mllog.constants.CACHE_CLEAR) mllogger.start(key=mllog.constants.INIT_START) # Initialize Weights & Biases logging if args.wandb and dist.rank == 0: import wandb wandb.init(project='cosmoflow', name=args.run_tag, id=args.run_tag, config=config, resume=args.run_tag) # Device and session configuration gpu = dist.local_rank if args.rank_gpu else None if gpu is not None: logging.info('Taking gpu %i', gpu) configure_session(gpu=gpu, intra_threads=args.intra_threads, inter_threads=args.inter_threads, kmp_blocktime=args.kmp_blocktime, kmp_affinity=args.kmp_affinity, omp_num_threads=args.omp_num_threads) # Mixed precision if args.amp: logging.info('Enabling mixed float16 precision') # Suggested bug workaround from https://github.com/tensorflow/tensorflow/issues/38516 if tf.__version__.startswith('2.2.'): from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check device_compatibility_check.log_device_compatibility_check = lambda policy_name, skip_local: None tf.keras.mixed_precision.experimental.set_policy('mixed_float16') # TF 2.3 #tf.keras.mixed_precision.set_global_policy('mixed_float16') # Start MLPerf logging if dist.rank == 0 and args.mlperf: log_submission_info(**config.get('mlperf', {})) mllogger.end(key=mllog.constants.INIT_STOP) mllogger.start(key=mllog.constants.RUN_START) # Load the data data_config = config['data'] if dist.rank == 0: logging.info('Loading data') datasets = get_datasets(dist=dist, **data_config) logging.debug('Datasets: %s', datasets) # Construct or reload the model if dist.rank == 0: logging.info('Building the model') train_config = config['train'] initial_epoch = 0 checkpoint_format = os.path.join(config['output_dir'], 'checkpoint-{epoch:03d}.h5') if args.resume and os.path.exists(checkpoint_format.format(epoch=1)): # Reload model from last checkpoint initial_epoch, model = reload_last_checkpoint( checkpoint_format, data_config['n_epochs'], distributed=args.distributed) else: # Build a new model model = get_model(**config['model']) # Configure the optimizer opt = get_optimizer(distributed=args.distributed, **config['optimizer']) # Compile the model model.compile(optimizer=opt, loss=train_config['loss'], metrics=train_config['metrics']) if dist.rank == 0: model.summary() # Save configuration to output directory if dist.rank == 0: config['n_ranks'] = dist.size save_config(config) # Prepare the callbacks if dist.rank == 0: logging.info('Preparing callbacks') callbacks = [] if args.distributed: # Broadcast initial variable states from rank 0 to all processes. 
callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) # Average metrics across workers callbacks.append(hvd.callbacks.MetricAverageCallback()) # Learning rate decay schedule if 'lr_schedule' in config: global_batch_size = data_config['batch_size'] * dist.size callbacks.append( tf.keras.callbacks.LearningRateScheduler( get_lr_schedule(global_batch_size=global_batch_size, **config['lr_schedule']))) # Timing timing_callback = TimingCallback() callbacks.append(timing_callback) # Checkpointing and logging from rank 0 only if dist.rank == 0: callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format)) callbacks.append( tf.keras.callbacks.CSVLogger(os.path.join(config['output_dir'], 'history.csv'), append=args.resume)) if args.tensorboard: callbacks.append( tf.keras.callbacks.TensorBoard( os.path.join(config['output_dir'], 'tensorboard'))) if args.mlperf: callbacks.append(MLPerfLoggingCallback()) if args.wandb: callbacks.append(wandb.keras.WandbCallback()) # Early stopping patience = train_config.get('early_stopping_patience', None) if patience is not None: callbacks.append( tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=patience, verbose=1)) # Stopping at specified target target_mae = train_config.get('target_mae', None) callbacks.append(StopAtTargetCallback(target_max=target_mae)) if dist.rank == 0: logging.debug('Callbacks: %s', callbacks) # Train the model if dist.rank == 0: logging.info('Beginning training') fit_verbose = 1 if (args.verbose and dist.rank == 0) else 2 model.fit(datasets['train_dataset'], steps_per_epoch=datasets['n_train_steps'], epochs=data_config['n_epochs'], validation_data=datasets['valid_dataset'], validation_steps=datasets['n_valid_steps'], callbacks=callbacks, initial_epoch=initial_epoch, verbose=fit_verbose) # Stop MLPerf timer if dist.rank == 0 and args.mlperf: mllogger.end(key=mllog.constants.RUN_STOP, metadata={'status': 'success'}) # Print training summary if dist.rank == 0: print_training_summary(config['output_dir'], args.print_fom) # Print GPU memory - not supported in TF 2.2? #if gpu is not None: # device = tf.config.list_physical_devices('GPU')[gpu] # #print(tf.config.experimental.get_memory_usage(device)) # #print(tf.config.experimental.get_memory_info(device)) # Finalize if dist.rank == 0: logging.info('All done!')
def get_params(args):
    np.set_printoptions(precision=3, suppress=True)

    global model_name, data_name, model_type, norm_type, acti_type
    global pretrain_augment, train_augment
    global num_cluster, embeded_dim, silent, slevel, to_disk, flevel, time_stamp, log_file
    global save_model_dir, print_every, buffer_size, batch_size, alpha
    global pretrain_lr, pretrain_optimizer_type
    global pretrain_epoch, train_lr, train_optimizer_type, maxiter, interval
    global cluster_loss_type, reconstruct_loss_type
    global num_repeat_kmeans, use_pretrain_model, save_pretrain_model, delta
    global re, cre, cl, dccbc

    # ==================== some other hyper-parameters ===================
    model_name = args.model_name  # "dec"
    data_name = args.data_name  # "mnist"
    model_type = args.model_type  # "conv" or "all_conv" or "mlp"
    norm_type = args.norm_type if args.norm_type != "None" else None  # None or "bn" or "normal"
    acti_type = args.acti_type  # "relu"
    pretrain_augment = args.pretrain_augment  # True
    train_augment = args.train_augment  # True
    model_name = "_".join([model_name, model_type])  # e.g. "dec_conv"
    num_cluster = dataset_clusters[data_name] if args.num_cluster < 0 else args.num_cluster  # 10
    embeded_dim = num_cluster if args.embeded_dim < 0 else args.embeded_dim  # 10
    silent = args.silent  # False
    slevel = args.slevel  # "debug" or "info"
    to_disk = args.to_disk  # True
    flevel = args.flevel  # "debug" or "info"
    time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    log_file = f"results/log/{model_name}_{data_name}_{time_stamp}.txt"
    save_model_dir = "results/model/"
    print_every = args.print_every  # 10
    buffer_size = args.buffer_size  # 70000
    batch_size = args.batch_size  # 256
    alpha = args.alpha  # 1
    pretrain_lr = args.pretrain_lr  # 0.001
    pretrain_optimizer_type = args.pretrain_optimizer_type  # "rmsprop"
    pretrain_epoch = args.pretrain_epoch  # 100
    train_lr = args.train_lr  # 0.001
    train_optimizer_type = args.train_optimizer_type  # "rmsprop"
    maxiter = args.maxiter
    interval = args.interval
    cluster_loss_type = args.cluster_loss_type  # "kl"
    reconstruct_loss_type = args.reconstruct_loss_type  # "l2"
    num_repeat_kmeans = args.num_repeat_kmeans  # 20
    use_pretrain_model = args.use_pretrain_model  # False
    save_pretrain_model = args.save_pretrain_model  # True
    delta = args.delta  # 0.001
    re, cre, cl, dccbc = args.re, args.cre, args.cl, args.dccbc

    # ==================== some settings ====================
    global logger, feature, label, train_dataset, test_dataset, transformer
    global sample_x, sample_y, pretrain_optimizer, train_optimizer
    global encoder, decoder, cluster_layer, model, cluster_loss_fn, reconstruct_loss_fn
    global encoder_path, decoder_path

    logger = Logger(silent=silent, slevel=slevel, to_disk=to_disk,
                    log_file=log_file, flevel=flevel)
    feature, label = load_merged_data(data_name)
    feature = feature / 255.0
    dataset = tf.data.Dataset.from_tensor_slices(
        {"feature": feature, "label": label, "idx": np.arange(label.shape[0])})
    train_dataset = dataset.shuffle(buffer_size).batch(batch_size)
    test_dataset = dataset.batch(batch_size)
    transformer = Transformer(data_name, batch_size)
    sample_x, sample_y = load_sample_data(data_name, 100)
    sample_x = sample_x / 255.

    pretrain_optimizer = get_optimizer(type=pretrain_optimizer_type, learning_rate=pretrain_lr)
    train_optimizer = get_optimizer(type=train_optimizer_type, learning_rate=train_lr)

    pretrain_augment_string = "augment" if pretrain_augment else "no_augment"
    encoder_path = "_".join([data_name, model_type, pretrain_augment_string, "encoder.h5"])
    encoder_path = os.path.join(save_model_dir, encoder_path)
    decoder_path = "_".join([data_name, model_type, pretrain_augment_string, "decoder.h5"])
    decoder_path = os.path.join(save_model_dir, decoder_path)

    if use_pretrain_model:
        logger.info(f"Use pretrained model: {encoder_path}, {decoder_path}")
        encoder = tf.keras.models.load_model(encoder_path)
        decoder = tf.keras.models.load_model(decoder_path)
        pretrain_epoch = 0
    else:
        logger.info("Warning: train Autoencoder from scratch")
        encoder, decoder = get_backbone(data_name, embeded_dim=embeded_dim,
                                        model_type=model_type, norm_type=norm_type,
                                        acti_type=acti_type)

    cluster_layer = ClusterLayer(num_cluster, alpha)
    model = {"encoder": encoder, "decoder": decoder, "cluster_layer": cluster_layer}
    cluster_loss_fn = losses[cluster_loss_type]
    reconstruct_loss_fn = losses[reconstruct_loss_type]
def run_training( experiment_name, debug=False, only_ees=False, only_kinematics=False, use_neptune=False, epochs=2000, # 20000 train_batch=21330, # 54726 // 2, val_batch=1123, # 2000 dtype=np.float32, val_split=0.05, shuffle_data=True, model_type="linear", # "GRU", # 'transformer', model_cfg=None, bptt=350, hidden_size=6, lr=1e-2, start_trim=700, log_interval=5, val_interval=20, clip_grad_norm=False, output_dir="results", normalize_input=True, optimizer="Adam", # "AdamW", scheduler=None, # "StepLR", train_weight=100., batch_first=True, toss_allzero_mn=True, dumb_augment=False, score="pearson", metric="l2"): # pearson """Run training and validation.""" if use_neptune and NEPTUNE_IMPORTED: neptune.init("Serre-Lab/deepspine") if experiment_name is None: experiment_name = "synthetic_data" neptune.create_experiment(experiment_name) assert model_type is not None, "You must select a model." default_model_params = tools.get_model_defaults() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") timestamp = datetime.datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d-%H_%M_%S') if model_cfg is None: print("Using default model cfg file.") model_cfg = model_type data = np.load(DATA_FILE) mn = data["mn"] ees = data["ees"] kinematics = data["kinematics"] X = torch.from_numpy(np.concatenate((ees, kinematics), 1).astype(dtype)) Y = torch.from_numpy(mn.astype(dtype)) X = X.permute(0, 2, 1) Y = Y.permute(0, 2, 1) if only_ees: X = X[..., 0][..., None] # Only ees -- 0.73 if only_kinematics: X = X[..., 1:] # Only kinematics -- 0.89 input_size = X.size(-1) output_size = Y.size(-1) meta = Meta( batch_first=batch_first, data_size=X.shape, train_batch=train_batch, val_batch=val_batch, val_split=val_split, model_type=model_type, model_cfg=model_cfg, input_size=input_size, hidden_size=hidden_size, output_size=output_size, metric=metric, score=score, normalize_input=normalize_input, lr=lr, bptt=bptt, epochs=epochs, optimizer=optimizer, scheduler=scheduler, clip_grad_norm=clip_grad_norm, log_interval=log_interval, val_interval=val_interval, start_trim=start_trim, train_weight=train_weight, device=device) # Prepare data if toss_allzero_mn: # Restrict to nonzero mn fibers # mask = (Y.sum(1) > 127.5).sum(-1) == 2 # Ys where both are nonzero at some point mask = ((Y > 200).sum(1) > 0).sum(-1) == 2 # Ys where both are > 127.5 at some point # mask = ((Y > 127.5).sum(1) > 0).sum(-1) >= 1 # Ys where either is > 127.5 at some point print("Throwing out {} examples.".format((mask == False).sum())) X = X[mask] Y = Y[mask] if meta.start_trim: X = X.narrow(1, meta.start_trim, X.size(1) - meta.start_trim) Y = Y.narrow(1, meta.start_trim, Y.size(1) - meta.start_trim) if shuffle_data: idx = np.random.permutation(len(X)) X = X[idx] Y = Y[idx] if meta.normalize_input: # X = (X - 127.5) / 127.5 # Y = (Y - 127.5) / 127.5 k_X = X[..., 1:] k_X = (k_X - k_X.mean(1, keepdim=True)) / (k_X.std(1, keepdim=True) + 1e-8) # This is peaking but whatever... e_X = X[..., 0][..., None] e_X = e_X / 255. X = torch.cat((k_X, e_X), -1) if meta.metric != "bce": Y = (Y - Y.mean(1, keepdim=True)) / (Y.std(1, keepdim=True) + 1e-8) # Y = Y / 255. 
else: # Quantize Y Y = (Y > 127.5).float() X = X.to(meta.device) Y = Y.to(meta.device) cv_idx = np.arange(len(X)) cv_idx = cv_idx > np.round(float(len(X)) * val_split).astype(int) X_train = X[cv_idx] Y_train = Y[cv_idx] X_val = X[~cv_idx] Y_val = Y[~cv_idx] assert meta.train_batch < len(X_train), "Train batch size > dataset size {}.".format(len(X_train) - 1) assert meta.val_batch < len(X_val), "Val batch size > dataset size {}.".format(len(X_val) - 1) if dumb_augment: X_train = torch.cat((X_train, X_train[:, torch.arange(X_train.size(1) - 1, -1, -1).long()])) Y_train = torch.cat((Y_train, Y_train[:, torch.arange(Y_train.size(1) - 1, -1, -1).long()])) if not meta.batch_first: X_train = X_train.permute(1, 0, 2) Y_train = Y_train.permute(1, 0, 2) X_val = X_val.permute(1, 0, 2) Y_val = Y_val.permute(1, 0, 2) # Create model model = modeling.create_model( batch_first=meta.batch_first, bptt=meta.bptt, model_type=meta.model_type, model_cfg=meta.model_cfg, input_size=meta.input_size, hidden_size=meta.hidden_size, output_size=meta.output_size, default_model_params=default_model_params, device=meta.device) num_params = sum([p.numel() for p in model.parameters() if p.requires_grad]) print('Total number of parameters: {}'.format(num_params)) score, criterion = metrics.get_metric(metric, meta.batch_first) optimizer_fun = optimizers.get_optimizer(optimizer) assert lr < 1, "LR is greater than 1." if "adam" in optimizer.lower(): optimizer = optimizer_fun(model.parameters(), lr=lr, amsgrad=True) else: optimizer = optimizer_fun(model.parameters(), lr=lr) if scheduler is not None: scheduler = optimizers.get_scheduler(scheduler) scheduler = scheduler(optimizer) # Start training best_val_loss = float("inf") best_model = None X_val, _ = batchify(X_val, bsz=meta.val_batch, random=False, batch_first=meta.batch_first) Y_val, _ = batchify(Y_val, bsz=meta.val_batch, random=False, batch_first=meta.batch_first) for epoch in range(1, meta.epochs + 1): epoch_start_time = time.time() meta.epoch = epoch X_train_i, random_idx = batchify( X_train, bsz=meta.train_batch, random=True, batch_first=meta.batch_first) Y_train_i, _ = batchify( Y_train, bsz=meta.train_batch, random=random_idx, batch_first=meta.batch_first) min_train_loss, max_train_loss, train_output, train_gt = train( model=model, X=X_train_i, Y=Y_train_i, optimizer=optimizer, criterion=criterion, score=score, scheduler=scheduler, meta=meta) if epoch % meta.val_interval == 0: val_loss, val_score, val_output, val_gt = evaluate( model=model, X=X_val, Y=Y_val, criterion=criterion, score=score, meta=meta) meta.min_train_loss.append(min_train_loss) meta.max_train_loss.append(max_train_loss) meta.val_loss.append(val_loss) meta.val_score.append(val_score) print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid score {:5.2f}'.format( epoch, (time.time() - epoch_start_time), meta.val_loss[-1], meta.val_score[-1])) print('-' * 89) if use_neptune and NEPTUNE_IMPORTED: neptune.log_metric('min_train_loss', min_train_loss) neptune.log_metric('max_train_loss', max_train_loss) neptune.log_metric('val_{}'.format(meta.metric), val_loss) neptune.log_metric('val_pearson', val_score) if val_loss < best_val_loss: best_val_loss = val_loss best_model = model if val_loss < 0.65 and debug: from matplotlib import pyplot as plt fig = plt.figure() plt.title('val') plt.subplot(211) plt.plot(val_output[50].cpu()) plt.subplot(212) plt.plot(val_gt[50].cpu()) plt.show() plt.close(fig) fig = plt.figure() plt.title('train') plt.subplot(211) 
plt.plot(train_output[50].cpu().detach()) plt.subplot(212) plt.plot(train_gt[50].cpu()) plt.show() plt.close(fig) if scheduler is not None: scheduler.step() # Fix some type issues meta.val_loss = [x.cpu() for x in meta.val_loss] meta.val_score = [x.cpu() for x in meta.val_score] np.savez(os.path.join(output_dir, '{}results_{}'.format(experiment_name, timestamp)), **meta.__dict__) # noqa np.savez(os.path.join(output_dir, '{}example_{}'.format(experiment_name, timestamp)), train_output=train_output.cpu().detach(), train_gt=train_gt.cpu(), val_output=val_output.cpu(), val_gt=val_gt.cpu()) torch.save(best_model.state_dict(), os.path.join(output_dir, '{}model_{}.pth'.format(experiment_name, timestamp)))
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, local_rank, n_ranks = init_workers(args.distributed)
    config = load_config(args.config, output_dir=args.output_dir,
                         data_config=args.data_config)
    os.makedirs(config['output_dir'], exist_ok=True)
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i local_rank %i size %i',
                 rank, local_rank, n_ranks)
    if rank == 0:
        logging.info('Configuration: %s', config)

    # Device and session configuration
    gpu = local_rank if args.rank_gpu else None
    configure_session(gpu=gpu, **config.get('device', {}))

    # Load the data
    data_config = config['data']
    if rank == 0:
        logging.info('Loading data')
    datasets = get_datasets(rank=rank, n_ranks=n_ranks, **data_config)
    logging.debug('Datasets: %s', datasets)

    # Construct or reload the model
    if rank == 0:
        logging.info('Building the model')
    initial_epoch = 0
    checkpoint_format = os.path.join(config['output_dir'], 'checkpoint-{epoch:03d}.h5')
    if args.resume:
        # Reload model from last checkpoint
        initial_epoch, model = reload_last_checkpoint(
            checkpoint_format, data_config['n_epochs'],
            distributed=args.distributed)
    else:
        # Build a new model
        model = get_model(**config['model'])

        # Configure the optimizer
        opt = get_optimizer(n_ranks=n_ranks, distributed=args.distributed,
                            **config['optimizer'])

        # Compile the model
        train_config = config['train']
        model.compile(optimizer=opt, loss=train_config['loss'],
                      metrics=train_config['metrics'])

    if rank == 0:
        model.summary()

    # Save configuration to output directory
    if rank == 0:
        data_config['n_train'] = datasets['n_train']
        data_config['n_valid'] = datasets['n_valid']
        save_config(config)

    # Prepare the callbacks
    if rank == 0:
        logging.info('Preparing callbacks')
    callbacks = []
    if args.distributed:
        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

        # Learning rate warmup
        train_config = config['train']
        warmup_epochs = train_config.get('lr_warmup_epochs', 0)
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=1))

        # Learning rate decay schedule
        lr_schedule = train_config.get('lr_schedule', {})
        if rank == 0:
            logging.info('Adding LR decay schedule: %s', lr_schedule)
        callbacks.append(tf.keras.callbacks.LearningRateScheduler(
            schedule=lambda epoch, lr: lr * lr_schedule.get(epoch, 1)))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and CSV logging from rank 0 only
    if rank == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
        callbacks.append(tf.keras.callbacks.CSVLogger(
            os.path.join(config['output_dir'], 'history.csv'), append=args.resume))

    if rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    if rank == 0:
        logging.info('Beginning training')
    fit_verbose = 1 if (args.verbose and rank == 0) else 2
    model.fit(datasets['train_dataset'],
              steps_per_epoch=datasets['n_train_steps'],
              epochs=data_config['n_epochs'],
              validation_data=datasets['valid_dataset'],
              validation_steps=datasets['n_valid_steps'],
              callbacks=callbacks,
              initial_epoch=initial_epoch,
              verbose=fit_verbose)

    # Print training summary
    if rank == 0:
        print_training_summary(config['output_dir'])

    # Finalize
    if rank == 0:
        logging.info('All done!')
def main(): """Main function""" # Initialization args = parse_args() dist = init_workers(args.distributed) config = load_config(args) os.makedirs(config['output_dir'], exist_ok=True) config_logging(verbose=args.verbose) logging.info('Initialized rank %i size %i local_rank %i local_size %i', dist.rank, dist.size, dist.local_rank, dist.local_size) if dist.rank == 0: logging.info('Configuration: %s', config) # Device and session configuration gpu = dist.local_rank if args.rank_gpu else None if gpu is not None: logging.info('Taking gpu %i', gpu) configure_session(gpu=gpu, intra_threads=args.intra_threads, inter_threads=args.inter_threads, kmp_blocktime=args.kmp_blocktime, kmp_affinity=args.kmp_affinity, omp_num_threads=args.omp_num_threads) # Load the data data_config = config['data'] if dist.rank == 0: logging.info('Loading data') datasets = get_datasets(dist=dist, **data_config) logging.debug('Datasets: %s', datasets) # Construct or reload the model if dist.rank == 0: logging.info('Building the model') train_config = config['train'] initial_epoch = 0 checkpoint_format = os.path.join(config['output_dir'], 'checkpoint-{epoch:03d}.h5') if args.resume and os.path.exists(checkpoint_format.format(epoch=1)): # Reload model from last checkpoint initial_epoch, model = reload_last_checkpoint( checkpoint_format, data_config['n_epochs'], distributed=args.distributed) else: # Build a new model model = get_model(**config['model']) # Configure the optimizer opt = get_optimizer(distributed=args.distributed, **config['optimizer']) # Compile the model model.compile(optimizer=opt, loss=train_config['loss'], metrics=train_config['metrics']) if dist.rank == 0: model.summary() # Save configuration to output directory if dist.rank == 0: config['n_ranks'] = dist.size save_config(config) # Prepare the callbacks if dist.rank == 0: logging.info('Preparing callbacks') callbacks = [] if args.distributed: # Broadcast initial variable states from rank 0 to all processes. 
callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) # Average metrics across workers callbacks.append(hvd.callbacks.MetricAverageCallback()) # Learning rate decay schedule if 'lr_schedule' in config: global_batch_size = data_config['batch_size'] * dist.size callbacks.append( tf.keras.callbacks.LearningRateScheduler( get_lr_schedule(global_batch_size=global_batch_size, **config['lr_schedule']))) # Timing timing_callback = TimingCallback() callbacks.append(timing_callback) # Checkpointing and logging from rank 0 only if dist.rank == 0: callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format)) callbacks.append( tf.keras.callbacks.CSVLogger(os.path.join(config['output_dir'], 'history.csv'), append=args.resume)) if args.tensorboard: callbacks.append( tf.keras.callbacks.TensorBoard( os.path.join(config['output_dir'], 'tensorboard'))) # Early stopping patience = config.get('early_stopping_patience', None) if patience is not None: callbacks.append( tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=patience, verbose=1)) if dist.rank == 0: logging.debug('Callbacks: %s', callbacks) # Train the model if dist.rank == 0: logging.info('Beginning training') fit_verbose = 1 if (args.verbose and dist.rank == 0) else 2 model.fit(datasets['train_dataset'], steps_per_epoch=datasets['n_train_steps'], epochs=data_config['n_epochs'], validation_data=datasets['valid_dataset'], validation_steps=datasets['n_valid_steps'], callbacks=callbacks, initial_epoch=initial_epoch, verbose=fit_verbose) # Print training summary if dist.rank == 0: print_training_summary(config['output_dir'], args.print_fom) # Finalize if dist.rank == 0: logging.info('All done!')
def __init__(self, cfg, writer, img_writer, logger, run_id):
    # Copy shared config fields
    if "monodepth_options" in cfg:
        cfg["data"].update(cfg["monodepth_options"])
        cfg["model"].update(cfg["monodepth_options"])
        cfg["training"]["monodepth_loss"].update(cfg["monodepth_options"])
    if "generated_depth_dir" in cfg["data"]:
        dataset_name = f"{cfg['data']['dataset']}_" \
                       f"{cfg['data']['width']}x{cfg['data']['height']}"
        depth_teacher = cfg["data"].get("depth_teacher", None)
        assert not (depth_teacher and cfg['model'].get('depth_estimator_weights') is not None)
        if depth_teacher is not None:
            cfg["data"]["generated_depth_dir"] += dataset_name + "/" + depth_teacher + "/"
        else:
            cfg["data"]["generated_depth_dir"] += dataset_name + "/" + cfg['model']['depth_estimator_weights'] + "/"

    # Setup seeds
    setup_seeds(cfg.get("seed", 1337))
    if cfg["data"]["dataset_seed"] == "same":
        cfg["data"]["dataset_seed"] = cfg["seed"]

    # Setup device
    torch.backends.cudnn.benchmark = cfg["training"].get("benchmark", True)

    self.cfg = cfg
    self.writer = writer
    self.img_writer = img_writer
    self.logger = logger
    self.run_id = run_id
    self.mIoU = 0
    self.fwAcc = 0
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.setup_segmentation_unlabeled()

    self.unlabeled_require_depth = (
        self.cfg["training"]["unlabeled_segmentation"] is not None and
        (self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depth" or
         self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depthcomp" or
         self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depthhist"))

    # Prepare depth estimates
    do_precalculate_depth = (self.cfg["training"]["segmentation_lambda"] != 0 and
                             self.unlabeled_require_depth and
                             self.cfg['model']['segmentation_name'] != 'mtl_pad')
    use_depth_teacher = cfg["data"].get("depth_teacher", None) is not None
    if do_precalculate_depth or use_depth_teacher:
        assert not (do_precalculate_depth and use_depth_teacher)
        if not self.cfg["training"].get("disable_depth_estimator", False):
            print("Prepare depth estimates")
            depth_estimator = DepthEstimator(cfg)
            depth_estimator.prepare_depth_estimates()
            del depth_estimator
            torch.cuda.empty_cache()
    else:
        self.cfg["data"]["generated_depth_dir"] = None

    # Setup Dataloader
    load_labels, load_sequence = True, True
    if self.cfg["training"]["monodepth_lambda"] == 0:
        load_sequence = False
    if self.cfg["training"]["segmentation_lambda"] == 0:
        load_labels = False
    train_data_cfg = deepcopy(self.cfg["data"])
    if not do_precalculate_depth and not use_depth_teacher:
        train_data_cfg["generated_depth_dir"] = None
    self.train_loader = build_loader(train_data_cfg, "train",
                                     load_labels=load_labels,
                                     load_sequence=load_sequence)
    if self.cfg["training"].get("minimize_entropy_unlabeled", False) or \
            self.enable_unlabled_segmentation:
        unlabeled_segmentation_cfg = deepcopy(self.cfg["data"])
        if not self.only_unlabeled and self.mix_use_gt:
            unlabeled_segmentation_cfg["load_onehot"] = True
        if self.only_unlabeled:
            unlabeled_segmentation_cfg.update({"load_unlabeled": True, "load_labeled": False})
        elif self.only_labeled:
            unlabeled_segmentation_cfg.update({"load_unlabeled": False, "load_labeled": True})
        else:
            unlabeled_segmentation_cfg.update({"load_unlabeled": True, "load_labeled": True})
        if self.mix_video:
            assert not self.mix_use_gt and not self.only_labeled and not self.only_unlabeled, \
                "Video sample indices are not compatible with non-video indices."
        unlabeled_segmentation_cfg.update({"only_sequences_with_segmentation": not self.mix_video,
                                           "restrict_to_subset": None})
        self.unlabeled_loader = build_loader(unlabeled_segmentation_cfg, "train",
                                             load_labels=load_labels if not self.mix_video else False,
                                             load_sequence=load_sequence)
    else:
        self.unlabeled_loader = None
    self.val_loader = build_loader(self.cfg["data"], "val",
                                   load_labels=load_labels, load_sequence=load_sequence)

    self.n_classes = self.train_loader.n_classes

    # The monodepth dataloader settings use drop_last=True and shuffle=True even for val
    self.train_data_loader = data.DataLoader(
        self.train_loader,
        batch_size=self.cfg["training"]["batch_size"],
        num_workers=self.cfg["training"]["n_workers"],
        shuffle=self.cfg["data"]["shuffle_trainset"],
        pin_memory=True,  # Setting to false will cause crash at the end of epoch
        drop_last=True,
    )
    if self.unlabeled_loader is not None:
        self.unlabeled_data_loader = infinite_iterator(data.DataLoader(
            self.unlabeled_loader,
            batch_size=self.cfg["training"]["batch_size"],
            num_workers=self.cfg["training"]["n_workers"],
            shuffle=self.cfg["data"]["shuffle_trainset"],
            pin_memory=True,  # Setting to false will cause crash at the end of epoch
            drop_last=True,
        ))

    self.val_batch_size = self.cfg["training"]["val_batch_size"]
    self.val_data_loader = data.DataLoader(
        self.val_loader,
        batch_size=self.val_batch_size,
        num_workers=self.cfg["training"]["n_workers"],
        pin_memory=True,
        # If using a dataset with an odd number of samples (CamVid), the memory consumption
        # suddenly increases for the last batch. This can be circumvented by dropping the last
        # batch. Only do that if it is necessary for your system as it will result in an
        # incomplete validation set.
        # drop_last=True,
    )

    # Setup Model
    self.model = get_model(cfg["model"], self.n_classes).to(self.device)
    # print(self.model)
    assert not (self.enable_unlabled_segmentation and self.cfg["training"]["save_monodepth_ema"])
    if self.enable_unlabled_segmentation and not self.only_labeled:
        print("Create segmentation ema model.")
        self.ema_model = self.create_ema_model(self.model).to(self.device)
    elif self.cfg["training"]["save_monodepth_ema"]:
        print("Create depth ema model.")
        # TODO: Try to remove unnecessary components and fit into gpu for better performance
        self.ema_model = self.create_ema_model(self.model)  # .to(self.device)
    else:
        self.ema_model = None

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg["training"]["optimizer"].items()
                        if k not in ["name", "backbone_lr", "pose_lr", "depth_lr", "segmentation_lr"]}
    train_params = get_train_params(self.model, self.cfg)
    self.optimizer = optimizer_cls(train_params, **optimizer_params)
    self.scheduler = get_scheduler(self.optimizer, self.cfg["training"]["lr_schedule"])

    # Creates a GradScaler once at the beginning of training.
    self.scaler = GradScaler(enabled=self.cfg["training"]["amp"])

    self.loss_fn = get_segmentation_loss_function(self.cfg)
    self.monodepth_loss_calculator_train = get_monodepth_loss(self.cfg, is_train=True)
    self.monodepth_loss_calculator_val = get_monodepth_loss(self.cfg, is_train=False,
                                                            batch_size=self.val_batch_size)

    if cfg["training"]["early_stopping"] is None:
        logger.info("Using No Early Stopping")
        self.earlyStopping = None
    else:
        self.earlyStopping = EarlyStopping(
            patience=round(cfg["training"]["early_stopping"]["patience"] /
                           cfg["training"]["val_interval"]),
            min_delta=cfg["training"]["early_stopping"]["min_delta"],
            cumulative_delta=cfg["training"]["early_stopping"]["cum_delta"],
            logger=logger
        )
def train(): model = STARVE() # get style target style_img_path = DatasetParam.style_img_path style_target = model(tf.constant(load_img(style_img_path)))['style'] # get content image path list if DatasetParam.use_video: content_img_list = glob.glob( join(TrainParam.video_frames_dir, '*.{}'.format(DatasetParam.img_fmt))) content_img_list.sort(key=lambda x: int(splitext(basename(x))[0])) else: content_img_list = [DatasetParam.content_img_path] # record all frames from last iteration img_sqe = [] for n_img, content_img_path in enumerate(content_img_list): # Call tf.function each time, or there will be # ValueError: tf.function-decorated function tried to create variables on non-first call # because of issues with lazy execution. # https://www.machinelearningplus.com/deep-learning/how-use-tf-function-to-speed-up-python-code-tensorflow/ tf_train_step = tf.function(train_step) optimizer = get_optimizer() content_target = model(tf.constant( load_img(content_img_path)))['content'] generated_image = tf.Variable( load_img(content_img_path, do_preprocess=False)) pbar = tqdm(range(TrainParam.n_step)) pbar.set_description_str('[{}/{} {}]'.format( n_img + 1, len(content_img_list), basename(content_img_path))) for step in pbar: tf_train_step(model, generated_image, optimizer, content_target, style_target) if (step + 1) % TrainParam.draw_step == 0: plt.imsave( join(TrainParam.iter_img_dir, "{}.{}".format(step + 1, DatasetParam.img_fmt)), tensor_to_image(generated_image)) else: plt.imsave( join(TrainParam.stylized_img_dir, basename(content_img_path)), tensor_to_image(generated_image)) img_sqe.append(generated_image) # long term consistency if DatasetParam.use_video: direction = -1 step_bar = tqdm(range(TrainParam.consistency_step)) step_bar.set_description_str('[consistency step]') # new it new_img_sqe = [] for step in step_bar: tf_train_step = tf.function(consistent_train_step) optimizer = get_optimizer() pbar = tqdm(range(len(content_img_list))) pbar.set_description_str('[{}/{}]'.format(len(content_img_list), step + 1)) for frame_idx in pbar: # can try to optimize them by putting them outside of the loop # or init at begining of this function content_img_path = content_img_list[frame_idx] content_target = model(tf.constant( load_img(content_img_path)))['content'] generated_image = tf.Variable( init_img(img_sqe, frame_idx, direction)) tf_train_step(model, optimizer, content_target, style_target, frame_idx, img_sqe, direction, generated_image) new_img_sqe.append(generated_image) if frame_idx % TrainParam.check_frame_step == 0: plt.imsave( join(TrainParam.iter_consistent_frames_dir, "{}.{}".format(step + 1, DatasetParam.img_fmt)), tensor_to_image(generated_image)) if step % TrainParam.change_passdir_step == 0: direction = -direction img_sqe = new_img_sqe for frame_idx, generated_image in enumerate(img_sqe): content_img_path = content_img_list[frame_idx] plt.imsave( join(TrainParam.consistent_frames_dir, basename(content_img_path)), tensor_to_image(generated_image)) return
def main(): global best_iou global best_dice # model model = smp.Unet(encoder_name=configs.encoder, encoder_weights=configs.encoder_weights, classes=configs.num_classes, activation=configs.activation) if len(configs.gpu_id) > 1: model = nn.DataParallel(model) model.cuda() # get files filenames = glob(configs.dataset + "masks/*") filenames = [os.path.basename(i) for i in filenames] # random split dataset into train and val train_files, val_files = train_test_split(filenames, test_size=0.2) # define different aug if configs.use_strong_aug: transform_train = stong_aug() else: transform_train = get_training_augmentation() transform_valid = get_valid_augmentation() # make data loader for train and val train_dataset = SegDataset(train_files, phase="train", transforms=transform_train) valid_dataset = SegDataset(val_files, phase="valid", transforms=transform_valid) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=configs.bs, shuffle=True, num_workers=configs.workers) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=configs.bs, shuffle=False, num_workers=configs.workers) optimizer = get_optimizer(model) loss_func = get_loss_func(configs.loss_func) criterion = loss_func().cuda() # tensorboardX writer writer = SummaryWriter(configs.log_dir) # set lr scheduler method if configs.lr_scheduler == "step": scheduler_default = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) elif configs.lr_scheduler == "on_loss": scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=0.2, patience=5, verbose=False) elif configs.lr_scheduler == "on_iou": scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='max', factor=0.2, patience=5, verbose=False) elif configs.lr_scheduler == "on_dice": scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='max', factor=0.2, patience=5, verbose=False) elif configs.lr_scheduler == "cosine": scheduler_default = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, configs.epochs - configs.warmup_epo) else: scheduler_default = torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1) # scheduler with warmup if configs.warmup: scheduler = GradualWarmupScheduler(optimizer, multiplier=configs.warmup_factor, total_epoch=configs.warmup_epo, after_scheduler=scheduler_default) else: scheduler = scheduler_default for epoch in range(configs.epochs): print('\nEpoch: [%d | %d] LR: %.8f' % (epoch + 1, configs.epochs, optimizer.param_groups[0]['lr'])) train_loss, train_dice, train_iou = train(train_loader, model, criterion, optimizer, epoch, writer) valid_loss, valid_dice, valid_iou = eval(valid_loader, model, criterion, epoch, writer) if configs.lr_scheduler == "step" or configs.lr_scheduler == "cosine" or configs.warmup: scheduler.step(epoch) elif configs.lr_scheduler == "on_iou": scheduler.step(valid_iou) elif configs.lr_scheduler == "on_dice": scheduler.step(valid_dice) elif configs.lr_scheduler == "on_loss": scheduler.step(valid_loss) # save model is_best_iou = valid_iou > best_iou is_best_dice = valid_dice > best_dice best_iou = max(valid_iou, best_iou) best_dice = max(valid_dice, best_dice) print("Best {}: {} ,Best Dice: {}".format(configs.metric, best_iou, best_dice)) save_checkpoint({ 'state_dict': model.state_dict(), }, is_best_iou, is_best_dice)
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config) train_config = config['training'] output_dir = os.path.expandvars(config['output_dir']) checkpoint_format = os.path.join(output_dir, 'checkpoints', 'checkpoint-{epoch}.h5') if rank==0: os.makedirs(output_dir, exist_ok=True) # Loggging config_logging(verbose=args.verbose) logging.info('Initialized rank %i out of %i', rank, n_ranks) if args.show_config: logging.info('Command line config: %s', args) if rank == 0: logging.info('Job configuration: %s', config) logging.info('Saving job outputs to %s', output_dir) # Configure session device_config = config.get('device', {}) configure_session(**device_config) # Load the data train_gen, valid_gen = get_datasets(batch_size=train_config['batch_size'], **config['data']) # Build the model model = get_model(**config['model']) # Configure optimizer opt = get_optimizer(n_ranks=n_ranks, dist_wrapper=hvd.DistributedOptimizer, **config['optimizer']) # Compile the model model.compile(loss=train_config['loss'], optimizer=opt, metrics=train_config['metrics']) if rank == 0: model.summary() # Prepare the training callbacks callbacks = get_basic_callbacks(args.distributed) # Learning rate warmup warmup_epochs = train_config.get('lr_warmup_epochs', 0) callbacks.append(hvd.callbacks.LearningRateWarmupCallback( warmup_epochs=warmup_epochs, verbose=1)) # Learning rate decay schedule for lr_schedule in train_config.get('lr_schedule', []): if rank == 0: logging.info('Adding LR schedule: %s', lr_schedule) callbacks.append(hvd.callbacks.LearningRateScheduleCallback(**lr_schedule)) # Checkpoint only from rank 0 if rank == 0: os.makedirs(os.path.dirname(checkpoint_format), exist_ok=True) callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format)) # Timing callback timing_callback = TimingCallback() callbacks.append(timing_callback) # Train the model train_steps_per_epoch = max([len(train_gen) // n_ranks, 1]) valid_steps_per_epoch = max([len(valid_gen) // n_ranks, 1]) history = model.fit_generator(train_gen, epochs=train_config['n_epochs'], steps_per_epoch=train_steps_per_epoch, validation_data=valid_gen, validation_steps=valid_steps_per_epoch, callbacks=callbacks, workers=4, verbose=2 if rank==0 else 0) # Save training history if rank == 0: # Print some best-found metrics if 'val_acc' in history.history.keys(): logging.info('Best validation accuracy: %.3f', max(history.history['val_acc'])) if 'val_top_k_categorical_accuracy' in history.history.keys(): logging.info('Best top-5 validation accuracy: %.3f', max(history.history['val_top_k_categorical_accuracy'])) logging.info('Average time per epoch: %.3f s', np.mean(timing_callback.times)) np.savez(os.path.join(output_dir, 'history'), n_ranks=n_ranks, **history.history) # Drop to IPython interactive shell if args.interactive and (rank == 0): logging.info('Starting IPython interactive session') import IPython IPython.embed() if rank == 0: logging.info('All done!')
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, local_rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)

    # Configure logging
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i local_rank %i size %i',
                 rank, local_rank, n_ranks)

    # Device configuration
    configure_session(gpu=local_rank, **config.get('device', {}))

    # Load the data
    train_data, valid_data = get_datasets(rank=rank, n_ranks=n_ranks, **config['data'])
    if rank == 0:
        logging.info(train_data)
        logging.info(valid_data)

    # Construct the model and optimizer
    model = get_model(**config['model'])
    optimizer = get_optimizer(n_ranks=n_ranks, **config['optimizer'])
    train_config = config['train']

    # Custom metrics for pixel accuracy and IoU
    metrics = [PixelAccuracy(), PixelIoU(name='iou', num_classes=3)]

    # Compile the model
    model.compile(loss=train_config['loss'], optimizer=optimizer, metrics=metrics)

    # Print a model summary
    if rank == 0:
        model.summary()

    # Prepare the callbacks
    callbacks = []
    if args.distributed:
        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

        # Learning rate warmup
        warmup_epochs = train_config.get('lr_warmup_epochs', 0)
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=1))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and CSV logging from rank 0 only
    #if rank == 0:
    #    callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
    #    callbacks.append(tf.keras.callbacks.CSVLogger(
    #        os.path.join(config['output_dir'], 'history.csv'), append=args.resume))

    if rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    verbosity = 2 if rank == 0 or args.verbose else 0
    history = model.fit(train_data,
                        validation_data=valid_data,
                        epochs=train_config['n_epochs'],
                        callbacks=callbacks,
                        verbose=verbosity)

    # All done
    if rank == 0:
        logging.info('All done!')
def main(): """Main function""" # Initialization args = parse_args() rank, n_ranks = init_workers(args.distributed) # Load configuration config = load_config(args.config) train_config = config['training'] output_dir = os.path.expandvars(config['output_dir']) checkpoint_format = os.path.join(output_dir, 'checkpoints', 'checkpoint-{epoch}.h5') os.makedirs(output_dir, exist_ok=True) # Logging config_logging(verbose=args.verbose, output_dir=output_dir) logging.info('Initialized rank %i out of %i', rank, n_ranks) if args.show_config: logging.info('Command line config: %s', args) if rank == 0: logging.info('Job configuration: %s', config) logging.info('Saving job outputs to %s', output_dir) # Configure session if args.distributed: gpu = hvd.local_rank() else: gpu = args.gpu device_config = config.get('device', {}) configure_session(gpu=gpu, **device_config) # Load the data train_gen, valid_gen = get_datasets(batch_size=train_config['batch_size'], **config['data_and_model'], **config['data']) # Build the model # if (type(config['data']['n_components']) is int): # rho_length_in = config['data']['n_components'] # else: rho_length_in = config['model']['rho_length_out'] model = get_model(rho_length_in=rho_length_in, **config['data_and_model'], **config['model']) # Configure optimizer opt = get_optimizer(n_ranks=n_ranks, distributed=args.distributed, **config['optimizer']) # Compile the model model.compile(loss=train_config['loss'], optimizer=opt, metrics=train_config['metrics']) if rank == 0: model.summary() # Prepare the training callbacks callbacks = [] if args.distributed: # Broadcast initial variable states from rank 0 to all processes. callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) # # Learning rate warmup # warmup_epochs = train_config.('lr_warmup_epochs', 0) # callbacks.append(hvd.callbacks.LearningRateWarmupCallback( # warmup_epochs=warmup_epochs, verbose=1)) # # Learning rate decay schedule # for lr_schedule in train_config.get('lr_schedule', []): # if rank == 0: # logging.info('Adding LR schedule: %s', lr_schedule) # callbacks.append(hvd.callbacks.LearningRateScheduleCallback(**lr_schedule)) # Checkpoint only from rank 0 if rank == 0: #os.makedirs(os.path.dirname(checkpoint_format), exist_ok=True) #callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format)) #callbacks.append(keras.callbacks.EarlyStopping(monitor='val_loss', # patience=5)) callbacks.append(keras.callbacks.ModelCheckpoint(filepath=os.path.join(output_dir, 'model.h5'), monitor='val_mean_absolute_error', save_best_only=False, verbose=2)) # Timing timing_callback = TimingCallback() callbacks.append(timing_callback) # Train the model steps_per_epoch = len(train_gen) // n_ranks # import pdb # pdb.set_trace() history = model.fit_generator(train_gen, epochs=train_config['n_epochs'], steps_per_epoch=steps_per_epoch, validation_data=valid_gen, validation_steps=len(valid_gen), callbacks=callbacks, workers=4, verbose=1) # Save training history if rank == 0: # Print some best-found metrics if 'val_acc' in history.history.keys(): logging.info('Best validation accuracy: %.3f', max(history.history['val_acc'])) if 'val_top_k_categorical_accuracy' in history.history.keys(): logging.info('Best top-5 validation accuracy: %.3f', max(history.history['val_top_k_categorical_accuracy'])) if 'val_mean_absolute_error' in history.history.keys(): logging.info('Best validation mae: %.3f', min(history.history['val_mean_absolute_error'])) logging.info('Average time per epoch: %.3f s', np.mean(timing_callback.times)) 
np.savez(os.path.join(output_dir, 'history'), n_ranks=n_ranks, **history.history) # Drop to IPython interactive shell if args.interactive and (rank == 0): logging.info('Starting IPython interactive session') import IPython IPython.embed() if rank == 0: logging.info('All done!')