def do_visualize(model, model_path, nr_visualize=100, output_dir='output'):
    """
    Visualize some intermediate results (proposals, raw predictions) inside the pipeline.

    Args:
        model: model handed to ``PredictConfig``.
        model_path: checkpoint location handed to ``SmartInit``.
        nr_visualize: upper bound on the number of datapoints taken from the
            training dataflow.
        output_dir: directory that receives one ``NNN.png`` per datapoint.
            If it already exists it is deleted first and then recreated.
    """
    dataflow = get_train_dataflow()
    dataflow.reset_state()

    # The proposal tensors live under a scope named after the active mode.
    proposal_kind = 'fpn' if cfg.MODE_FPN else 'rpn'
    tensor_names = [
        'generate_{}_proposals/boxes'.format(proposal_kind),
        'generate_{}_proposals/scores'.format(proposal_kind),
        'fastrcnn_all_scores',
        'output/boxes',
        'output/scores',
        'output/labels',
    ]
    predict_config = PredictConfig(
        model=model,
        session_init=SmartInit(model_path),
        input_names=['image', 'gt_boxes', 'gt_labels'],
        output_names=tensor_names)
    pred = OfflinePredictor(predict_config)

    # Always start from an empty output directory.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    fs.mkdir_p(output_dir)

    with tqdm.tqdm(total=nr_visualize) as pbar:
        for idx, dp in enumerate(itertools.islice(dataflow, nr_visualize)):
            img = dp['image']
            gt_boxes = dp['gt_boxes']
            gt_labels = dp['gt_labels']

            (rpn_boxes, rpn_scores, all_scores,
             final_boxes, final_scores, final_labels) = pred(
                img, gt_boxes, gt_labels)

            # Panel 1: groundtruth boxes.
            gt_viz = draw_annotation(img, gt_boxes, gt_labels)
            # Panel 2: best proposals for each groundtruth, to show recall.
            proposal_viz, good_proposals_ind = draw_proposal_recall(
                img, rpn_boxes, rpn_scores, gt_boxes)
            # Panel 3: the scores for the proposals selected above.
            score_viz = draw_predictions(
                img,
                rpn_boxes[good_proposals_ind],
                all_scores[good_proposals_ind])
            # Panel 4: final outputs; the fourth DetectionResult field is None.
            results = [
                DetectionResult(box, score, label, None)
                for box, score, label in zip(
                    final_boxes, final_scores, final_labels)
            ]
            final_viz = draw_final_outputs(img, results)

            # Arrange the four panels in a 2x2 grid.
            panels = [gt_viz, proposal_viz, score_viz, final_viz]
            viz = tpviz.stack_patches(panels, 2, 2)

            # Only show the image interactively when DISPLAY is set.
            if os.environ.get('DISPLAY'):
                tpviz.interactive_imshow(viz)
            cv2.imwrite("{}/{:03d}.png".format(output_dir, idx), viz)
            pbar.update()
def main(args):
    """
    Set up logging, the model, the learning-rate schedules and the callbacks
    from ``cfg``, then launch training.

    Args:
        args: object providing ``logdir`` (logger directory, also passed to
            each ``EvalCallback``) and ``load`` (optional model to initialize
            from; when falsy, ``cfg.BACKBONE.WEIGHTS`` is used instead).
    """
    # "spawn/forkserver" is safer than the default "fork" method and
    # produce more deterministic behavior & memory saving
    # However its limitation is you cannot pass a lambda function to subprocesses.
    import multiprocessing as mp
    mp.set_start_method('spawn')

    if get_tf_version_tuple() < (1, 6):
        # https://github.com/tensorflow/tensorflow/issues/14657
        logger.warn(
            "TF<1.6 has a bug which may lead to crash in FasterRCNN if you're unlucky."
        )

    # Setup logging: with horovod, only rank 0 sets the logger directory.
    use_horovod = cfg.TRAINER == 'horovod'
    if use_horovod:
        hvd.init()
    if not (use_horovod and hvd.rank() != 0):
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Environment Information:\n" + collect_env_info())

    finalize_configs(is_training=True)

    # Create model
    if cfg.MODE_FPN:
        model = ResNetFPNModel()
    else:
        model = ResNetC4Model()

    # Compute the training schedule from the number of GPUs.
    # warmup is step based, lr is epoch based
    steps_per_epoch = cfg.TRAIN.STEPS_PER_EPOCH
    gpu_scale = 8. / cfg.TRAIN.NUM_GPUS
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(gpu_scale, 1.)
    warmup_schedule = [
        (0, init_lr),
        (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR),
    ]
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / steps_per_epoch

    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
    # Every boundary except the last one multiplies the rate by another 0.1;
    # the last entry of LR_SCHEDULE is used below for the total length.
    lr_schedule.extend(
        (boundary * gpu_scale // steps_per_epoch,
         cfg.TRAIN.BASE_LR * 0.1**power)
        for power, boundary in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1], start=1))

    logger.info("Warm Up Schedule (steps, value): " + str(warmup_schedule))
    logger.info("LR Schedule (epochs, value): " + str(lr_schedule))

    train_dataflow = get_train_dataflow()
    # This is what's commonly referred to as "epochs"
    total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size()
    logger.info(
        "Total passes of the training set is: {:.5g}".format(total_passes))

    # Create callbacks
    checkpoint_saver = PeriodicCallback(
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD)
    # linear warmup
    warmup_setter = ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True)
    callbacks = [
        checkpoint_saver,
        warmup_setter,
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 1 minute timeout
    ]

    if cfg.TRAIN.EVAL_PERIOD > 0:
        for dataset in cfg.DATA.VAL:
            callbacks.append(
                EvalCallback(dataset,
                             *model.get_inference_tensor_names(),
                             args.logdir))

    if use_horovod and hvd.rank() > 0:
        session_init = None
    elif args.load:
        # ignore mismatched values, so you can `--load` a model for fine-tuning
        session_init = SmartInit(args.load, ignore_mismatch=True)
    else:
        session_init = SmartInit(cfg.BACKBONE.WEIGHTS)

    # The last LR_SCHEDULE entry, scaled like the other boundaries, gives the
    # number of epochs to train.
    max_epoch = cfg.TRAIN.LR_SCHEDULE[-1] * gpu_scale // steps_per_epoch
    traincfg = TrainConfig(
        model=model,
        data=QueueInput(train_dataflow),
        callbacks=callbacks,
        monitors=[AMLMonitor()],
        steps_per_epoch=steps_per_epoch,
        max_epoch=max_epoch,
        session_init=session_init,
        starting_epoch=cfg.TRAIN.STARTING_EPOCH)

    if not use_horovod:
        # nccl mode appears faster than cpu mode
        trainer = SyncMultiGPUTrainerReplicated(
            cfg.TRAIN.NUM_GPUS, average=False, mode='nccl')
    else:
        trainer = HorovodTrainer(average=False)
    launch_train_with_config(traincfg, trainer)
# Compute the training schedule from the number of GPUs ... stepnum = cfg.TRAIN.STEPS_PER_EPOCH # warmup is step based, lr is epoch based init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.) warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)] warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)] factor = 8. / cfg.TRAIN.NUM_GPUS for idx, steps in enumerate(cfg.TRAIN.LR_SCHEDULE[:-1]): mult = 0.1**(idx + 1) lr_schedule.append( (steps * factor // stepnum, cfg.TRAIN.BASE_LR * mult)) logger.info("Warm Up Schedule (steps, value): " + str(warmup_schedule)) logger.info("LR Schedule (epochs, value): " + str(lr_schedule)) train_dataflow = get_train_dataflow() # This is what's commonly referred to as "epochs" total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size() logger.info( "Total passes of the training set is: {:.5g}".format(total_passes)) # Create callbacks ... callbacks = [ PeriodicCallback(ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1), every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD), # linear warmup ScheduledHyperParamSetter('learning_rate', warmup_schedule, interp='linear', step_based=True),