def __init__(
        self,
        # Model Params
        input_format,    # NCHW or NHWC
        compute_format,  # NCHW or NHWC
        n_channels,
        activation_fn,
        weight_init_method,
        model_variant,
        input_shape,
        mask_shape,
        input_normalization_method,
        # Training HParams
        augment_data,
        loss_fn_name,
        # Runtime HParams
        amp,
        xla,
        # Directory Params
        model_dir=None,
        log_dir=None,
        sample_dir=None,
        data_dir=None,
        dataset_name=None,
        dataset_hparams=None,
        # Debug Params
        log_every_n_steps=1,
        debug_verbosity=0,
        seed=None):

    if dataset_hparams is None:
        dataset_hparams = dict()

    if compute_format not in ['NHWC', 'NCHW']:
        raise ValueError(
            "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])" % compute_format)

    if input_format not in ['NHWC', 'NCHW']:
        raise ValueError(
            "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])" % input_format)

    if n_channels not in [1, 3]:
        raise ValueError(
            "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))" % n_channels)

    if data_dir is not None and not os.path.exists(data_dir):
        raise ValueError("The `data_dir` received does not exist: %s" % data_dir)

    if hvd_utils.is_using_hvd():
        hvd.init()

        if hvd.rank() == 0:
            print("Horovod successfully initialized ...")

        # Give each worker a distinct, but reproducible, seed.
        tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None

    else:
        tf_seed = 2 * seed if seed is not None else None

    # ============================================
    # Optimisation Flags - Do not remove
    # ============================================

    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())

    if hvd_utils.is_using_hvd():
        # `hvd.size()` is only valid once Horovod has been initialized.
        print("WORLD_SIZE", hvd.size())

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'

    # =================================================

    self.xla = xla

    if amp:
        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
            print("TF AMP is activated - Experimental Feature")

        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    # =================================================

    model_hparams = tf.contrib.training.HParams(
        # Model Params
        input_format=input_format,
        compute_format=compute_format,
        input_shape=input_shape,
        mask_shape=mask_shape,
        n_channels=n_channels,
        activation_fn=activation_fn,
        weight_init_method=weight_init_method,
        model_variant=model_variant,
        input_normalization_method=input_normalization_method,

        # Training HParams
        augment_data=augment_data,
        loss_fn_name=loss_fn_name,

        # Runtime Params
        amp=amp,

        # Debug Params
        log_every_n_steps=log_every_n_steps,
        debug_verbosity=debug_verbosity,
        seed=tf_seed)

    # Only the chief worker (rank 0) keeps the output directories; the other
    # workers receive `None` so that they do not write logs, samples or
    # checkpoints.
    run_config_additional = tf.contrib.training.HParams(
        dataset_hparams=dataset_hparams,
        model_dir=model_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
        log_dir=log_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
        sample_dir=sample_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
        data_dir=data_dir,
        num_preprocessing_threads=32,
    )

    if sample_dir is not None and (not hvd_utils.is_using_hvd() or hvd.rank() == 0):
        try:
            os.makedirs(sample_dir)
        except FileExistsError:
            pass

    self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional)

    if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
        print('Defining Model Estimator ...\n')

    self._model = UNet_v1(
        model_name="UNet_v1",
        input_format=self.run_hparams.input_format,
        compute_format=self.run_hparams.compute_format,
        n_output_channels=1,
        unet_variant=self.run_hparams.model_variant,
        weight_init_method=self.run_hparams.weight_init_method,
        activation_fn=self.run_hparams.activation_fn)

    if self.run_hparams.seed is not None:
        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
            print("Deterministic Run - Seed: %d\n" % seed)

        tf.set_random_seed(self.run_hparams.seed)
        np.random.seed(self.run_hparams.seed)
        random.seed(self.run_hparams.seed)

    if dataset_name not in known_datasets.keys():
        raise RuntimeError(
            "The dataset `%s` is unknown, allowed values: %s ..." %
            (dataset_name, list(known_datasets.keys())))

    self.dataset = known_datasets[dataset_name](data_dir=data_dir, **self.run_hparams.dataset_hparams)

    self.num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
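
# `_add_bool_argument` (used below) is defined elsewhere in this project; a
# minimal sketch of the expected behaviour, an assumption rather than the
# project's actual implementation, would register a paired
# `--name` / `--noname` boolean flag:
#
#     def _add_bool_argument(parser, name=None, default=False, required=False, help=None):
#         if not isinstance(default, bool):
#             raise ValueError("`default` must be a bool: %s" % default)
#         group = parser.add_mutually_exclusive_group(required=required)
#         group.add_argument('--' + name, dest=name, action='store_true', help=help)
#         group.add_argument('--no' + name, dest=name, action='store_false')
#         parser.set_defaults(**{name: default})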
def parse_cmdline():
    p = argparse.ArgumentParser(description="JoC-UNet_v1-TF")

    p.add_argument(
        '--unet_variant',
        default="tinyUNet",
        choices=UNet_v1.authorized_models_variants,
        type=str,
        required=False,
        help="""Which model size is used. This parameter directly controls the size and the number of parameters.""")

    p.add_argument(
        '--activation_fn',
        choices=authorized_activation_fn,
        type=str,
        default="relu",
        required=False,
        help="""Which activation function is used after the convolution layers.""")

    p.add_argument(
        '--exec_mode',
        choices=['train', 'train_and_evaluate', 'evaluate', 'training_benchmark', 'inference_benchmark'],
        type=str,
        required=True,
        help="""Which execution mode to run the model in.""")

    p.add_argument(
        '--iter_unit',
        choices=['epoch', 'batch'],
        type=str,
        required=True,
        help="""Whether `--num_iter` is counted in batches or in epochs.""")

    p.add_argument(
        '--num_iter',
        type=int,
        required=True,
        help="""Number of iterations to run.""")

    p.add_argument(
        '--batch_size',
        type=int,
        required=True,
        help="""Size of each minibatch per GPU.""")

    p.add_argument(
        '--warmup_step',
        default=200,
        type=int,
        required=False,
        help="""Number of steps considered as warmup and not taken into account for performance measurements.""")

    p.add_argument(
        '--results_dir',
        type=str,
        required=True,
        help="""Directory in which to write training logs, summaries and checkpoints.""")

    _add_bool_argument(
        parser=p,
        name="save_eval_results_to_json",
        default=False,
        required=False,
        help="Whether to save evaluation results in JSON format.")

    p.add_argument(
        '--data_dir',
        required=False,
        default=None,
        type=str,
        help="Path to the dataset directory.")

    p.add_argument(
        '--dataset_name',
        choices=list(known_datasets.keys()),
        type=str,
        required=True,
        help="""Name of the dataset used in this run (only DAGM2007 is currently supported).""")

    p.add_argument(
        '--dataset_classID',
        default=None,
        type=int,
        required=False,
        help="""Class ID to consider when training or evaluating the network (used for DAGM).""")

    p.add_argument(
        '--data_format',
        choices=['NHWC', 'NCHW'],
        type=str,
        default="NCHW",
        required=False,
        help="""Which tensor format is used for computation inside the model.""")

    _add_bool_argument(
        parser=p,
        name="use_tf_amp",
        default=False,
        required=False,
        help="Enable Automatic Mixed Precision to speed up FP32 computation using Tensor Cores.")

    _add_bool_argument(
        parser=p,
        name="use_xla",
        default=False,
        required=False,
        help="Enable TensorFlow XLA to maximise performance.")

    p.add_argument(
        '--weight_init_method',
        choices=UNet_v1.authorized_weight_init_methods,
        default="he_normal",
        type=str,
        required=False,
        help="""Which initialisation method is used to randomly initialize the model during training.""")

    p.add_argument(
        '--learning_rate',
        default=1e-5,
        type=float,
        required=False,
        help="""Learning rate value.""")

    p.add_argument(
        '--learning_rate_decay_factor',
        default=0.75,
        type=float,
        required=False,
        help="""Decay factor applied to decrease the learning rate.""")

    p.add_argument(
        '--learning_rate_decay_steps',
        default=500,
        type=int,
        required=False,
        help="""Number of steps between learning rate decays.""")

    p.add_argument(
        '--rmsprop_decay',
        default=0.9,
        type=float,
        required=False,
        help="""RMSProp - Decay value.""")

    p.add_argument(
        '--rmsprop_momentum',
        default=0.8,
        type=float,
        required=False,
        help="""RMSProp - Momentum value.""")

    p.add_argument(
        '--weight_decay',
        default=1e-4,
        type=float,
        required=False,
        help="""Weight decay scale factor.""")

    _add_bool_argument(
        parser=p,
        name="use_auto_loss_scaling",
        default=False,
        required=False,
        help="Use AutoLossScaling with TF-AMP.")
    p.add_argument(
        '--loss_fn_name',
        type=str,
        default="adaptive_loss",
        required=False,
        help="""Name of the loss function used to train the network.""")

    _add_bool_argument(
        parser=p,
        name="augment_data",
        default=True,
        required=False,
        help="Choose whether to use data augmentation.")

    p.add_argument(
        '--display_every',
        type=int,
        default=50,
        required=False,
        help="""How often (in batches) to print out debug information.""")

    p.add_argument(
        '--debug_verbosity',
        choices=[0, 1, 2],
        default=0,
        type=int,
        required=False,
        help="""Verbosity level: 0 minimum, 1 with layer creation debug info, 2 with layer + variable creation debug info.""")

    p.add_argument(
        '--seed',
        type=int,
        default=None,
        help="""Random seed.""")

    FLAGS, unknown_args = p.parse_known_args()

    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)

        raise ValueError("Invalid command line arg(s)")

    return FLAGS
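
# Hypothetical wiring of the parsed flags into the `Runner` defined above.
# The real entry point lives in the project's main script; the shape and
# normalization values below are illustrative assumptions, not taken from
# this file:
#
#     FLAGS = parse_cmdline()
#
#     runner = Runner(
#         input_format='NHWC',
#         compute_format=FLAGS.data_format,
#         n_channels=1,
#         activation_fn=FLAGS.activation_fn,
#         weight_init_method=FLAGS.weight_init_method,
#         model_variant=FLAGS.unet_variant,
#         input_shape=(512, 512),
#         mask_shape=(512, 512),
#         input_normalization_method='zero_one',
#         augment_data=FLAGS.augment_data,
#         loss_fn_name=FLAGS.loss_fn_name,
#         amp=FLAGS.use_tf_amp,
#         xla=FLAGS.use_xla,
#         model_dir=FLAGS.results_dir,
#         log_dir=FLAGS.results_dir,
#         sample_dir=FLAGS.results_dir,
#         data_dir=FLAGS.data_dir,
#         dataset_name=FLAGS.dataset_name,
#         dataset_hparams={'class_id': FLAGS.dataset_classID},
#         log_every_n_steps=FLAGS.display_every,
#         debug_verbosity=FLAGS.debug_verbosity,
#         seed=FLAGS.seed,
#     )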