def _setup_logging(logdir, is_horovod):
    """Initialise Horovod if requested, then configure and prime the logger.

    Args:
        logdir: directory handed to ``logger.set_logger_dir``.
        is_horovod: truthy when training runs under Horovod; in that case
            ``hvd.init()`` is called before anything else.

    The log directory is only configured when Horovod is not in use or when
    this process is Horovod rank 0. The environment report is logged by
    every process.
    """
    if is_horovod:
        hvd.init()

    # Short-circuit keeps hvd.rank() from being queried without Horovod.
    owns_logdir = (not is_horovod) or hvd.rank() == 0
    if owns_logdir:
        # The 'd' action is passed straight through to set_logger_dir.
        logger.set_logger_dir(logdir, 'd')

    env_report = collect_env_info()
    logger.info("Environment Information:\n" + env_report)
def main(args):
    """Build the model, schedules, callbacks and trainer, then start training.

    Everything except the log directory and the checkpoint to load is read
    from the global ``cfg``.

    Args:
        args: parsed command-line namespace. ``args.logdir`` is used for the
            logger and the evaluation callbacks; ``args.load`` (optional) is
            a model to initialise from.

    NOTE: the multiprocessing start method can only be set once per process,
    so this function is meant to be called a single time.
    """
    # "spawn/forkserver" is safer than the default "fork" method and
    # produce more deterministic behavior & memory saving
    # However its limitation is you cannot pass a lambda function to subprocesses.
    import multiprocessing as mp
    mp.set_start_method('spawn')

    if get_tf_version_tuple() < (1, 6):
        # https://github.com/tensorflow/tensorflow/issues/14657
        logger.warning(
            "TF<1.6 has a bug which may lead to crash in FasterRCNN if you're unlucky."
        )

    # Setup logging (initialises Horovod first when it is the trainer).
    is_horovod = cfg.TRAINER == 'horovod'
    _setup_logging(args.logdir, is_horovod)

    finalize_configs(is_training=True)

    # Create model
    MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model()

    # Compute the training schedule from the number of GPUs ...
    stepnum = cfg.TRAIN.STEPS_PER_EPOCH
    # warmup is step based, lr is epoch based
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
    warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]

    # Step counts in cfg.TRAIN.LR_SCHEDULE are rescaled by 8 / NUM_GPUS
    # before being converted to epochs.
    factor = 8. / cfg.TRAIN.NUM_GPUS
    for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]):
        mult = 0.1**(idx + 1)
        lr_schedule.append(
            (steps * factor // stepnum, cfg.TRAIN.BASE_LR * mult))
    logger.info("Warm Up Schedule (steps, value): " + str(warmup_schedule))
    logger.info("LR Schedule (epochs, value): " + str(lr_schedule))

    train_dataflow = get_train_dataflow()
    # This is what's commonly referred to as "epochs"
    total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size()
    logger.info(
        "Total passes of the training set is: {:.5g}".format(total_passes))

    # Create callbacks ...
    callbacks = [
        PeriodicCallback(
            ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
            every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD),
        # linear warmup
        ScheduledHyperParamSetter('learning_rate',
                                  warmup_schedule,
                                  interp='linear',
                                  step_based=True),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 1 minute timeout
    ]
    if cfg.TRAIN.EVAL_PERIOD > 0:
        # One evaluation callback per validation dataset.
        callbacks.extend([
            EvalCallback(dataset, *MODEL.get_inference_tensor_names(),
                         args.logdir) for dataset in cfg.DATA.VAL
        ])

    if is_horovod and hvd.rank() > 0:
        # Non-zero Horovod ranks do not load weights themselves
        # (presumably they are synchronised from rank 0 -- confirm).
        session_init = None
    elif args.load:
        # ignore mismatched values, so you can `--load` a model for fine-tuning
        session_init = SmartInit(args.load, ignore_mismatch=True)
    else:
        session_init = SmartInit(cfg.BACKBONE.WEIGHTS)

    traincfg = TrainConfig(
        model=MODEL,
        data=QueueInput(train_dataflow),
        callbacks=callbacks,
        monitors=[AMLMonitor()],
        steps_per_epoch=stepnum,
        max_epoch=cfg.TRAIN.LR_SCHEDULE[-1] * factor // stepnum,
        session_init=session_init,
        starting_epoch=cfg.TRAIN.STARTING_EPOCH)

    if is_horovod:
        trainer = HorovodTrainer(average=False)
    else:
        # nccl mode appears faster than cpu mode
        trainer = SyncMultiGPUTrainerReplicated(cfg.TRAIN.NUM_GPUS,
                                                average=False,
                                                mode='nccl')
    launch_train_with_config(traincfg, trainer)
) args = parser.parse_args() if args.config: cfg.update_args(args.config) register_coco(cfg.DATA.BASEDIR) # add COCO datasets to the registry # Setup logger ... is_horovod = cfg.TRAINER == 'horovod' if is_horovod: hvd.init() logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size())) if not is_horovod or hvd.rank() == 0: logger.set_logger_dir(args.logdir, 'd') logger.info("Environment Information:\n" + collect_env_info()) finalize_configs(is_training=True) # Compute the training schedule from the number of GPUs ... stepnum = cfg.TRAIN.STEPS_PER_EPOCH # warmup is step based, lr is epoch based init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.) warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)] warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)] factor = 8. / cfg.TRAIN.NUM_GPUS for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]): mult = 0.1**(idx + 1) lr_schedule.append(
# Dataset registration is best-effort: a dataset that cannot be registered is
# reported and skipped. Only ordinary errors are caught, so Ctrl-C
# (KeyboardInterrupt) and SystemExit still stop the program.
try:
    register_voc(cfg.DATA.BASEDIR)  # add VOC datasets to the registry
except Exception:
    logger.warning('VOC does not find!')
try:
    register_coco(cfg.DATA.BASEDIR)  # add COCO datasets to the registry
except Exception:
    logger.warning('COCO does not find!')

# Setup logging ...
is_horovod = cfg.TRAINER == 'horovod'
if is_horovod:
    hvd.init()
# Only the non-Horovod process, or Horovod rank 0, configures the log directory.
if not is_horovod or hvd.rank() == 0:
    logger.set_logger_dir(args.logdir, 'd')
logger.info('Environment Information:\n' + collect_env_info())

finalize_configs(is_training=True)

# Create model
MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model()

# Compute the training schedule from the number of GPUs ...
stepnum = cfg.TRAIN.STEPS_PER_EPOCH
# warmup is step based, lr is epoch based
init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
factor = 8. / cfg.TRAIN.NUM_GPUS
choices=[50, 101]) parser.add_argument('--logdir', default='train_log/ResNet-GN') parser.add_argument('--WS', action='store_true', help='Use Weight Standardization') args = parser.parse_args() model = Model() model.depth = args.depth model.use_WS = args.WS if args.eval: batch = 128 # something that can run on one gpu ds = get_imagenet_dataflow(args.data, 'val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir(args.logdir, 'd') try: from tensorpack.tfutils import collect_env_info logger.info("\n" + collect_env_info()) except Exception: pass config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1)) launch_train_with_config(config, trainer)