parser.add_argument('--benchmark', action='store_true',
                    help="Benchmark the speed of the model + postprocessing")
parser.add_argument('--config',
                    help="A list of KEY=VALUE to overwrite those defined in config.py",
                    nargs='+')
parser.add_argument('--compact', help='Save a model to .pb')
parser.add_argument('--serving', help='Save a model to serving file')

args = parser.parse_args()
if args.config:
    cfg.update_args(args.config)
register_mot(cfg.DATA.BASEDIR)  # add the MOT datasets to the registry
MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model()

if not tf.test.is_gpu_available():
    from tensorflow.python.framework import test_util
    assert get_tf_version_tuple() >= (1, 7) and test_util.IsMklEnabled(), \
        "Inference requires either GPU support or MKL support!"
assert args.load
finalize_configs(is_training=False)

if args.predict or args.visualize:
    cfg.TEST.RESULT_SCORE_THRESH = cfg.TEST.RESULT_SCORE_THRESH_VIS

if args.visualize:
    do_visualize(MODEL, args.load)
else:
    predcfg = PredictConfig(
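# --- Hedged usage sketch (not from the original source): one way the flags parsed above
# --- could be combined on the command line. The script name and the checkpoint path are
# --- assumptions; the flags themselves and the MODE_FPN config key appear in the code above.
#
#   python predict.py --load /path/to/checkpoint.ckpt \
#       --config MODE_FPN=True \
#       --benchmark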
def main(args):
    # "spawn/forkserver" is safer than the default "fork" method and
    # produces more deterministic behavior & memory savings.
    # However, its limitation is that you cannot pass a lambda function to subprocesses.
    import multiprocessing as mp
    mp.set_start_method('spawn')

    if get_tf_version_tuple() < (1, 6):
        # https://github.com/tensorflow/tensorflow/issues/14657
        logger.warn(
            "TF<1.6 has a bug which may lead to crashes in FasterRCNN if you're unlucky.")

    # Set up logging ...
    is_horovod = cfg.TRAINER == 'horovod'
    if is_horovod:
        hvd.init()
    if not is_horovod or hvd.rank() == 0:
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Environment Information:\n" + collect_env_info())

    finalize_configs(is_training=True)

    # Create the model
    MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model()

    # Compute the training schedule from the number of GPUs ...
    stepnum = cfg.TRAIN.STEPS_PER_EPOCH
    # warmup is step based, lr is epoch based
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
    warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]

    factor = 8. / cfg.TRAIN.NUM_GPUS
    for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]):
        mult = 0.1 ** (idx + 1)
        lr_schedule.append(
            (steps * factor // stepnum, cfg.TRAIN.BASE_LR * mult))
    logger.info("Warm Up Schedule (steps, value): " + str(warmup_schedule))
    logger.info("LR Schedule (epochs, value): " + str(lr_schedule))

    train_dataflow = get_train_dataflow()
    # This is what's commonly referred to as "epochs"
    total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size()
    logger.info(
        "Total passes of the training set is: {:.5g}".format(total_passes))

    # Create callbacks ...
    callbacks = [
        PeriodicCallback(
            ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
            every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD),
        # linear warmup
        ScheduledHyperParamSetter(
            'learning_rate', warmup_schedule, interp='linear', step_based=True),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 1 minute timeout
        # AMLCallback()
        # GPUUtilizationTracker()
    ]
    if cfg.TRAIN.EVAL_PERIOD > 0:
        callbacks.extend([
            EvalCallback(dataset, *MODEL.get_inference_tensor_names(), args.logdir)
            for dataset in cfg.DATA.VAL
        ])

    if is_horovod and hvd.rank() > 0:
        session_init = None
    else:
        if args.load:
            # Ignore mismatched values, so you can `--load` a model for fine-tuning
            session_init = SmartInit(args.load, ignore_mismatch=True)
        else:
            session_init = SmartInit(cfg.BACKBONE.WEIGHTS)

    traincfg = TrainConfig(
        model=MODEL,
        data=QueueInput(train_dataflow),
        callbacks=callbacks,
        monitors=[AMLMonitor()],
        steps_per_epoch=stepnum,
        max_epoch=cfg.TRAIN.LR_SCHEDULE[-1] * factor // stepnum,
        session_init=session_init,
        starting_epoch=cfg.TRAIN.STARTING_EPOCH)

    if is_horovod:
        trainer = HorovodTrainer(average=False)
    else:
        # nccl mode appears faster than cpu mode
        trainer = SyncMultiGPUTrainerReplicated(
            cfg.TRAIN.NUM_GPUS, average=False, mode='nccl')
    launch_train_with_config(traincfg, trainer)
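# --- Hedged sketch, not part of the original script: a standalone illustration of the LR
# --- schedule arithmetic used in main(). _demo_lr_schedule is a hypothetical helper and the
# --- default numbers below are assumptions, not values taken from config.py.
def _demo_lr_schedule(num_gpus=1, stepnum=500, base_lr=0.01,
                      lr_schedule_steps=(240000, 320000, 360000)):
    # The step-based LR_SCHEDULE is tuned for 8 GPUs; with fewer GPUs every boundary is
    # stretched proportionally, then converted from steps to epochs of `stepnum` steps each.
    factor = 8. / num_gpus
    schedule = []
    for idx, steps in enumerate(lr_schedule_steps[:-1]):
        mult = 0.1 ** (idx + 1)
        schedule.append((steps * factor // stepnum, base_lr * mult))
    return schedule


# With the assumed defaults, _demo_lr_schedule() yields boundaries at epochs 3840 and 5120,
# dropping the learning rate to roughly 0.001 and then 0.0001 when training on a single GPU.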