Example #1
def main(_):
    common.initialize_preloading()
    if flags.FLAGS.use_horovod and flags.FLAGS.distribution_strategy != "off":
        raise RuntimeError(
            "Horovod and distribution strategy cannot be used together. Please select one of the scaleout methods."
        )
    if flags.FLAGS.distribution_strategy not in ["off", "hpu"]:
        raise RuntimeError(
            "Currently HPU supports only HPUStrategy, please set --distribution_strategy=hpu or use horovod"
        )
    if flags.FLAGS.use_horovod:
        if flags.FLAGS.horovod_hierarchical_allreduce:
            os.environ['HOROVOD_HIERARCHICAL_ALLREDUCE'] = "1"
        hvd_init()
    else:
        synapse_logger_init()
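    # load_habana_module() registers the HPU device with TensorFlow; these examples call it after scale-out init.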
    load_habana_module()

    if flags.FLAGS.global_seed:
        tf.random.set_seed(flags.FLAGS.global_seed)

    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)
Example #2
def hvd_try_init():
    global IS_HVD_INIT
    if not IS_HVD_INIT and horovod_enabled():
        hvd_init()
        IS_HVD_INIT = True

        tf.get_logger().propagate = False
        if hvd.rank() == 0:
            tf.logging.set_verbosity('INFO')
        else:
            tf.logging.set_verbosity('WARN')
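
Because of the IS_HVD_INIT guard, repeated calls are safe; a minimal usage sketch (assuming horovod_enabled() reflects this run's --use_horovod flag):

hvd_try_init()  # initializes Horovod on the first call
hvd_try_init()  # no-op on later calls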
Example #3
def main(_):
    common.initialize_preloading()
    if flags.FLAGS.use_horovod:
        hvd_init()
    else:
        synapse_logger_init()
    load_habana_module()

    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)
Example #4
def benchmark_demo_data_loader(data_dir,
                               batch_size=256,
                               bfloat16=True,
                               max_train_steps=1024,
                               datasets_num_private_threads=False,
                               allow_control_edges=True,
                               enable_profiling=False,
                               profiling_iter_cnt=20,
                               profiling_warmup_steps=20,
                               experimental_preloading=1):
    hooks = setup_hooks(enable_profiling,
                        profiling_iter_cnt,
                        profiling_warmup_steps=profiling_warmup_steps)

    setup_env(allow_control_edges, bfloat16)

    hvd_init()

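    # Stub the number of training files and the model class so only the input pipeline is exercised.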
    MonkeypatchStub.setattr(imagenet_main, "_NUM_TRAIN_FILES",
                            max_train_file_number(data_dir))

    MonkeypatchStub.setattr(imagenet_main, "ImagenetModel", ImagenetModelMock)

    tf.compat.v1.enable_resource_variables()

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    imagenet_main.define_imagenet_flags()
    flags.DEFINE_boolean("mini_imagenet", False, "mini ImageNet")

    argv = [
        "test", f"--model_dir=rank_{comm_rank()}", "--num_gpus=1",
        f"--data_dir={data_dir}", "--distribution_strategy=off",
        "--data_format=channels_last", f"--batch_size={batch_size}",
        "--return_before_eval=true", "--display_steps=100",
        "--use_horovod=true",
        f"--experimental_preloading={experimental_preloading}"
    ]

    if hooks:
        argv.append(f"--hooks={hooks}")

    if datasets_num_private_threads:
        argv.append(
            f"--datasets_num_private_threads={datasets_num_private_threads}")
    if max_train_steps:
        argv.append(f"--max_train_steps={max_train_steps}")

    flags.FLAGS(argv)

    imagenet_main.run_imagenet(flags.FLAGS)
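
Note that absl treats the first element of argv ("test" above) as the program name and parses the rest. A self-contained sketch of the same mechanism (the flag name is illustrative):

from absl import flags

flags.DEFINE_integer("batch_size", 32, "per-device batch size")
flags.FLAGS(["prog", "--batch_size=256"])
assert flags.FLAGS.batch_size == 256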
Example #5
def run_imagenet(flags_obj):
    """Run ResNet ImageNet training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results` and
      `train_hooks`. `eval_results` contains accuracy (top_1) and
      accuracy_top_5. `train_hooks` is a list of the instances of hooks used during
      training.
  """
    input_function = (get_synth_input_fn(flags_core.get_tf_dtype(flags_obj))
                      if flags_obj.use_synthetic_data else input_fn)

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if flags.FLAGS.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = flags.FLAGS.bf16_config_path

    # Disabling dynamic shapes is a workaround; dynamic-shape support for ResNeXt still needs to be investigated.
    os.environ["TF_ENABLE_DYNAMIC_SHAPES"] = "false"

    os.environ.setdefault("TF_DISABLE_MKL", "1")
    os.environ.setdefault("TF_ALLOW_CONTROL_EDGES_IN_HABANA_OPS", "1")

    if flags_obj.use_horovod:
        assert not flags_obj.no_hpu, "Horovod without HPU is not supported in helpers."
        hvd_init()
    else:
        synapse_logger_init()

    if not flags_obj.no_hpu:
        load_habana_module()

    result = resnet_run_loop.resnet_main(
        flags_obj,
        imagenet_model_fn,
        input_function,
        DATASET_NAME,
        shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])

    return result
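
os.environ.setdefault only writes a variable that the user has not already exported, so external overrides win. For example:

import os

os.environ["TF_DISABLE_MKL"] = "0"        # user override
os.environ.setdefault("TF_DISABLE_MKL", "1")
assert os.environ["TF_DISABLE_MKL"] == "0"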
Example #6
def run_imagenet(flags_obj):
    """Run ResNet ImageNet training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results` and
      `train_hooks`. `eval_results` contains accuracy (top_1) and
      accuracy_top_5. `train_hooks` is a list of the instances of hooks used during
      training.
  """
    input_function = (get_synth_input_fn(flags_core.get_tf_dtype(flags_obj))
                      if flags_obj.use_synthetic_data else input_fn)

    if flags.FLAGS.is_mlperf_enabled:
        tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    else:
        tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if flags_obj.use_horovod:
        assert not flags_obj.no_hpu, "Horovod without HPU is not supported in helpers."
        hvd_init()
    else:
        synapse_logger_init()

    if flags.FLAGS.is_mlperf_enabled:
        resnet_run_loop.init_mllog_mlloger()

    if not flags_obj.no_hpu:
        log_info_devices = load_habana_module()
        print(f"Devices:\n {log_info_devices}")

    result = resnet_run_loop.resnet_main(
        flags_obj,
        imagenet_model_fn,
        input_function,
        DATASET_NAME,
        shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])

    return result
Example #7
def main():
    """
    Starting point of the application
    """
    params = parse_args(description="UNet-medical")
    if params.use_horovod:
        hvd_init()
    set_flags(params)

    model_dir = prepare_model_dir(params)
    params.model_dir = model_dir
    logger = get_logger(params)

    tb_logger = None
    if params.tensorboard_logging:
        log_dir = params.log_dir
        if horovod_enabled() and params.log_all_workers:
            log_dir = os.path.join(log_dir, f'worker_{hvd_rank()}')
        tb_logger = namedtuple('TBSummaryWriters', 'train_writer eval_writer')(
            tf.summary.create_file_writer(log_dir),
            tf.summary.create_file_writer(os.path.join(log_dir, 'eval')))

    model = Unet()

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.fold,
                      augment=params.augment,
                      hpu_id=hvd_rank() if horovod_enabled() else 0,
                      num_hpus=hvd_size() if horovod_enabled() else 1,
                      seed=params.seed)

    if 'train' in params.exec_mode:
        with dump_callback(params.dump_config):
            train(params, model, dataset, logger, tb_logger)

    if 'evaluate' in params.exec_mode:
        evaluate(params, model, dataset, logger, tb_logger)

    if 'predict' in params.exec_mode:
        predict(params, model, dataset, logger)
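
The namedtuple simply bundles one summary writer for training and one for evaluation. The same construction in isolation (the path is illustrative):

import os
from collections import namedtuple
import tensorflow as tf

log_dir = "/tmp/unet_logs"  # illustrative
TBSummaryWriters = namedtuple("TBSummaryWriters", "train_writer eval_writer")
tb_logger = TBSummaryWriters(tf.summary.create_file_writer(log_dir),
                             tf.summary.create_file_writer(os.path.join(log_dir, "eval")))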
Example #8
def setup_horovod(params):
    params.hvd = None
    params.worker_id = 0
    params.num_workers = 1
    if params.use_horovod:
        if params.no_hpu:
            # Horovod on GPU
            import horovod.tensorflow as hvd
            hvd.init()
            os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())
        else:
            from TensorFlow.common.horovod_helpers import hvd, hvd_init
            hvd_init()
        params.worker_id = hvd.rank()
        params.num_workers = hvd.size()
        params.hvd = hvd
        if params.log_all_workers:
            params.log_dir = os.path.join(params.log_dir,
                                          f'worker_{params.worker_id}')
            params.model_dir = os.path.join(params.model_dir,
                                            f'worker_{params.worker_id}')

    return params
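
A hypothetical single-process invocation (argparse.Namespace stands in for the real params object; the paths are illustrative):

import argparse

params = argparse.Namespace(use_horovod=False, no_hpu=True, log_all_workers=False,
                            log_dir="/tmp/logs", model_dir="/tmp/model")
params = setup_horovod(params)
assert params.hvd is None and params.num_workers == 1 and params.worker_id == 0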
Example #9
    def __init__(self, runtime_config, model_fn):
        super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

        # Handle recipe cache. Skip if externally set or empty.
        recipe_cache = runtime_config.recipe_cache
        if 'TF_RECIPE_CACHE_PATH' not in os.environ.keys() and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

            # Clear previous recipe cache.
            if not MPI_is_distributed() or MPI_rank() == 0:
                if os.path.exists(recipe_cache) and os.path.isdir(
                        recipe_cache):
                    shutil.rmtree(recipe_cache)

        if MPI_is_distributed():
            os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
            os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
            # os.environ['HOROVOD_AUTOTUNE'] = '2'

            if runtime_config.device == "HPU":
                from TensorFlow.common.horovod_helpers import hvd_init, Framework
                hvd = hvd_init(framework=Framework.TENSORFLOW)
            else:
                import horovod.tensorflow as hvd
                hvd.init()

            # Other ranks should wait for recipe cache to be removed.
            # This operation can't be done before hvd_init.
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

            logging.info("Horovod successfully initialized ...")

        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_GPU_THREAD_COUNT'] = ('1' if not MPI_is_distributed()
                                             else str(hvd.size()))

        os.environ['TF_SYNC_ON_FINISH'] = '0'
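
The recipe-cache handling follows a rank-0-clears, everyone-barriers pattern. The same idea in isolation (a sketch using mpi4py directly; the cache path is illustrative):

import os
import shutil
from mpi4py import MPI

cache_dir = "/tmp/recipe_cache"  # illustrative
if MPI.COMM_WORLD.Get_rank() == 0 and os.path.isdir(cache_dir):
    shutil.rmtree(cache_dir)   # only one rank deletes the stale cache
MPI.COMM_WORLD.Barrier()       # the other ranks wait until removal is done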
Example #10
def main(argv):
  tf.disable_v2_behavior()
  tf.enable_resource_variables()

  if FLAGS.use_hpu and FLAGS.recipe_cache:
    prepare_recipe_cache()

  if FLAGS.use_horovod:
    if FLAGS.use_hpu:
      from TensorFlow.common.horovod_helpers import hvd_init, horovod_enabled, hvd
      hvd_init()
      assert horovod_enabled()
      if FLAGS.recipe_cache:
        # Other ranks should wait for recipe cache to be removed.
        # This operation can't be done before hvd_init.
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()
    else:
      import horovod.tensorflow as hvd
      hvd.init()
      assert hvd.size() > 1
      os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())

  if FLAGS.use_hpu:
    if FLAGS.use_bf16:
      os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    dyn_shapes_flag = 'TF_ENABLE_DYNAMIC_SHAPES'
    if dyn_shapes_flag not in os.environ:
        os.environ[dyn_shapes_flag] = 'false'
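    # Equivalent to os.environ.setdefault(dyn_shapes_flag, 'false').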

    from habana_frameworks.tensorflow import load_habana_module  # noqa
    load_habana_module()

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()
  if FLAGS.gpu_automatic_mixed_precision:
    setattr(hparams, "gpu_automatic_mixed_precision", True)
  if FLAGS.deterministic_dataset:
    hparams.add_hparam("deterministic_dataset", True)

  hparams.add_hparam("use_horovod", FLAGS.use_horovod)
  hparams.add_hparam("use_hpu", FLAGS.use_hpu)
  if FLAGS.use_horovod:
    hparams.add_hparam("hvd_worker_id", hvd.rank())
    hparams.add_hparam("hvd_size", hvd.size())

  if FLAGS.schedule == "run_std_server":
    run_std_server()
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.generate_data:
    generate_data()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  if is_chief():
    save_metadata(hparams)

  with dump_callback():
    execute_schedule(exp)
Example #11
        serving_input_receiver_fn=squad_serving_input_fn)

    tf.logging.info("Starting to export TFLite.")
    converter = tf.lite.TFLiteConverter.from_saved_model(
        subfolder,
        input_arrays=["input_ids", "input_mask", "segment_ids"],
        output_arrays=["start_logits", "end_logits"])
    float_model = converter.convert()
    tflite_file = os.path.join(FLAGS.export_dir, "albert_model.tflite")
    with tf.gfile.GFile(tflite_file, "wb") as f:
      f.write(float_model)


if __name__ == "__main__":

  if FLAGS.deterministic_run:
    tensorflow.random.set_seed(1)
    tf.compat.v1.set_random_seed(1)

  if FLAGS.use_horovod:
    hvd_init()

  load_habana_module()


  flags.mark_flag_as_required("spm_model_file")
  flags.mark_flag_as_required("albert_config_file")
  flags.mark_flag_as_required("output_dir")

  tf.app.run()
Example #12
def main(_):
    # tf.disable_v2_behavior()
    tf.compat.v1.disable_eager_execution()
    tf.compat.v1.enable_resource_variables()

    # Enable habana bf16 conversion pass
    if FLAGS.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = flags.FLAGS.bf16_config_path
        FLAGS.precision = 'bf16'
    else:
        os.environ['TF_BF16_CONVERSION'] = "0"

    if FLAGS.use_horovod:
        hvd_init()

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name,
            is_training=True,
            use_grayscale=FLAGS.use_grayscale)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'],
                    labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits,
                labels,
                label_smoothing=FLAGS.label_smoothing,
                weights=1.0)
            return end_points

        # Gather initial summaries.

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs

        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # if FLAGS.quantize_delay >= 0:
        #     quantize.create_training_graph(quant_delay=FLAGS.quantize_delay)  # for debugging

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                total_num_replicas=FLAGS.worker_replicas,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # optimize_clones computes the total loss and the gradients over all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        if horovod_enabled():
            hvd.broadcast_global_variables(0)
        ###########################
        # Kicks off the training. #
        ###########################
        with dump_callback():
            with logger.benchmark_context(FLAGS):
                eps1 = ExamplesPerSecondKerasHook(FLAGS.log_every_n_steps,
                                                  output_dir=FLAGS.train_dir,
                                                  batch_size=FLAGS.batch_size)

                write_hparams_v1(
                    eps1.writer, {
                        'batch_size': FLAGS.batch_size,
                        **{x: getattr(FLAGS, x)
                           for x in FLAGS}
                    })

                train_step_kwargs = {}
                if FLAGS.max_number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, FLAGS.max_number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                if FLAGS.log_every_n_steps > 0:
                    train_step_kwargs['should_log'] = math_ops.equal(
                        math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)

                eps1.on_train_begin()
                train_step_kwargs['EPS'] = eps1

                slim.learning.train(
                    train_tensor,
                    logdir=FLAGS.train_dir,
                    train_step_fn=train_step1,
                    train_step_kwargs=train_step_kwargs,
                    master=FLAGS.master,
                    is_chief=(FLAGS.task == 0),
                    init_fn=_get_init_fn(),
                    summary_op=summary_op,
                    summary_writer=None,
                    number_of_steps=FLAGS.max_number_of_steps,
                    log_every_n_steps=FLAGS.log_every_n_steps,
                    save_summaries_secs=FLAGS.save_summaries_secs,
                    save_interval_secs=FLAGS.save_interval_secs,
                    sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Example #13
def run_coco(args):
    print("Command: ", args.command)
    print("Model: ", args.model)
    print("Dataset: ", args.dataset)
    print("Year: ", args.year)
    print("Logs: ", args.logs)
    print("Auto Download: ", args.download)

    ############################################################
    #  Configurations
    ############################################################
    if args.deterministic:
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        tf.reset_default_graph()
        SEED = 0
        os.environ['PYTHONHASHSEED'] = str(SEED)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        random.seed(SEED)
        np.random.seed(SEED)
        tf.set_random_seed(SEED)

    is_master = True
    hvd = None

    if args.gpus < 0:
        config = tf.ConfigProto(device_count={'GPU': 0})
        K.set_session(tf.Session(config=config))
        print('running on cpu')

    if args.using_horovod and args.command == "train":
        if args.device in ['HPU']:
            from TensorFlow.common.horovod_helpers import hvd_init, Framework
            hvd = hvd_init(framework=Framework.KERAS)
        else:
            import horovod.tensorflow.keras as hvd
            hvd.init()
            confighorovod = tf.ConfigProto()
            confighorovod.gpu_options.visible_device_list = str(
                hvd.local_rank())
            K.set_session(tf.Session(config=confighorovod))
        is_master = hvd.local_rank() == 0
        if not is_master:
            tf.get_logger().setLevel(tf.logging.FATAL)

    elif args.using_horovod and args.command == "evaluate":
        if args.device in ['HPU']:
            from TensorFlow.common.horovod_helpers import hvd_init, Framework
            hvd = hvd_init(framework=Framework.KERAS)
        else:
            confighorovod = tf.ConfigProto()
            confighorovod.gpu_options.visible_device_list = str(args.gpus)
            K.set_session(tf.Session(config=confighorovod))
        is_master = hvd.local_rank() == 0 if hvd else True  # on the GPU evaluate path hvd is never initialized
        if not is_master:
            tf.get_logger().setLevel(tf.logging.FATAL)

    if args.device in ['HPU']:
        from TensorFlow.common.library_loader import load_habana_module
        load_habana_module()

    dev_str = f'/device:{args.device}:0'
    print(f'Selected device: {dev_str}')

    class CocoConfig(Config):
        """Configuration for training on MS COCO.
        Derives from the base Config class and overrides values specific
        to the COCO dataset.
        """
        # Give the configuration a recognizable name
        NAME = "coco"
        if hvd:
            _GPU_COUNT = hvd.size()
            GPU_COUNT = 1  # keep the per-worker batch size at IMAGES_PER_GPU
        else:
            _GPU_COUNT = abs(args.gpus)
            GPU_COUNT = _GPU_COUNT

        if args.fchollet_fix:
            BGR = True
            ## mean pixel is in RGB format to match original settings
            MEAN_PIXEL = [123.68, 116.78, 103.94]
        elif args.BGR or 'kapp_' in args.backbone:
            ## BGR/caffe format
            BGR = True
            MEAN_PIXEL = [103.94, 116.78, 123.68]
        else:
            ## default RGB mode
            BGR = False
            MEAN_PIXEL = [123.68, 116.78, 103.94]

        GT_NOISE_STD = 0

        QUICK_TEST = args.quick_test
        ## these can be used to run with dynamic shapes
        BIN_PADDING = None  # 8
        IMAGE_RESIZE_MODE = "square"  # "pad64"
        DYNAMIC_ANCHORS = False  # True
        PRESET_LAYERS_TRAIN = args.train_layers
        if args.dynamic:
            IMAGE_RESIZE_MODE = "pad64"
            DYNAMIC_ANCHORS = True

        if BIN_PADDING or IMAGE_RESIZE_MODE in ['no_pad', 'pad64'] or QUICK_TEST:
            IMAGES_PER_GPU = 1
        else:
            IMAGES_PER_GPU = 4
        # Override if specified.
        if args.images_per_gpu is not None:
            IMAGES_PER_GPU = args.images_per_gpu
        # always evaluate using same number of samples regardless of number of gpus
        VAL_SAMPLES = 1600
        if QUICK_TEST:
            VAL_SAMPLES = 1
        _BATCH_SIZE = _GPU_COUNT * IMAGES_PER_GPU
        VALIDATION_STEPS = None  # VAL_SAMPLES//_BATCH_SIZE
        if args.validation_steps is not None:
            VALIDATION_STEPS = args.validation_steps
        # lr is scaled with respect to the actual number of gpus
        LEARNING_RATE = 0.02 * (_BATCH_SIZE / 16)**0.5
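        # e.g. 8 workers x 4 images/GPU -> _BATCH_SIZE = 32, LEARNING_RATE = 0.02 * sqrt(2) ~= 0.028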
        DETERMINISTIC = args.deterministic
        if args.deterministic:
            LEARNING_RATE = 0
        STEPS_PER_EPOCH = None  # 5000
        PYRAMID_ROI_CUSTOM_OP = int(args.custom_roi)
        LEARNING_MOMENTUM_CONST = True if args.momentum_const == '1' else False
        COMBINED_NMS_OP = True if args.combined_nms == '1' else False
        USE_VALID_BOXES = args.use_valid_boxes
        if args.xl_inputs:
            TRAIN_ROIS_PER_IMAGE = 512
            ROI_POSITIVE_RATIO = 0.25
            IMAGE_MIN_DIM_TRAIN = [640, 672, 704, 736, 768, 800, 832]
            IMAGE_MIN_DIM_VAL = 832
            IMAGE_MAX_DIM = 1344
        else:
            TRAIN_ROIS_PER_IMAGE = 256
            ROI_POSITIVE_RATIO = 0.33
            IMAGE_MIN_DIM_TRAIN = [640, 672, 704, 736, 768, 800]
            IMAGE_MIN_DIM_VAL = 800
            IMAGE_MAX_DIM = 1024
        if QUICK_TEST:
            TRAIN_ROIS_PER_IMAGE = 20
            IMAGE_MAX_DIM = 512
        if args.clip_norm > 0:
            GRADIENT_CLIP_NORM = args.clip_norm
        else:
            GRADIENT_CLIP_NORM = None
        # Number of classes (including background)
        NUM_CLASSES = 1 + 80  # COCO has 80 classes
        BACKBONE = args.backbone
        RPN_ONLY = args.rpn_only
        ### schedule settings
        WARMUP = 1000
        if args.warmup_steps is not None:
            WARMUP = args.warmup_steps
        if QUICK_TEST:
            WARMUP = 1
        if RPN_ONLY:
            DROPS = [40, 60]
            TOT_EPOCHS = 70
        else:
            if args.short:  ## short regime
                DROPS = [77, 154]
                TOT_EPOCHS = 175
            else:  ## long regime
                DROPS = [210, 280]
                TOT_EPOCHS = 300

        if args.epochs is not None:
            TOT_EPOCHS = args.epochs

        if args.steps_per_epoch is not None:
            STEPS_PER_EPOCH = args.steps_per_epoch

        if STEPS_PER_EPOCH is not None:
            _SCHEDUAL_RATIO = max(STEPS_PER_EPOCH // 1000, 1)
        else:
            _SCHEDUAL_RATIO = max((117280 // _BATCH_SIZE) // 1000, 1)
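        # Rescale the LR-drop epochs; the dels below keep the loop temporaries out of the class namespace.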
        for i, v in enumerate(DROPS):
            DROPS[i] = int(v / _SCHEDUAL_RATIO + 0.5)
        del i
        del v
        if args.epochs is None:
            TOT_EPOCHS = int(TOT_EPOCHS / _SCHEDUAL_RATIO + 0.5)

    class InferenceConfig(CocoConfig):
        # Set batch size to 1 since we'll be running inference on
        # one image at a time. Batch size = GPU_COUNT * IMAGES_PER_GPU
        GPU_COUNT = 1
        IMAGES_PER_GPU = 1
        DETECTION_MIN_CONFIDENCE = 0.001

    if args.command == "train":
        config = CocoConfig()
        mode = "training"
    else:
        config = InferenceConfig()
        mode = "inference"

    with tf.device("/device:CPU:0"):
        model = modellib.MaskRCNN(dev_str,
                                  mode=mode,
                                  config=config,
                                  model_dir=args.logs,
                                  hvd=hvd)

    exclude = None
    # Select weights file to load
    if args.model.lower() == "coco":
        model_path = COCO_MODEL_PATH
    elif args.model.lower() == "last":
        # Find last trained weights
        model_path = model.find_last()
    elif args.model.lower() == "imagenet":
        # Start from ImageNet trained weights
        with tf.device(dev_str):
            model_path = model.get_imagenet_weights()
    else:
        model_path = args.model
        if 'r101_imagenet_init.h5' in args.model:
            exclude = r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)|(anchors.*)|(mask\_.*)|"

    # Load weights
    if is_master:
        config.display()
        model.keras_model.summary()
        print("Loading weights", model_path)
    if 'keras' not in args.model:
        # keras backbone weights are automatically loaded during build
        with tf.device(dev_str):
            model.load_weights(model_path,
                               by_name=True,
                               exclude=exclude,
                               resume=args.resume,
                               verbose=is_master)
    # Train or evaluate
    if args.command == "train":
        # Training dataset. Use the training set and 35K from the
        # validation set, as in the Mask R-CNN paper.
        num_shards = 1
        shard_id = 0
        if hvd:
            num_shards = hvd.local_size()
            shard_id = hvd.local_rank()
        dataset_train = CocoDataset()
        dataset_train.load_coco(args.dataset,
                                "train",
                                year=args.year,
                                auto_download=args.download,
                                num_shards=num_shards,
                                shard_id=shard_id)

        if args.year in '2014':
            dataset_train.load_coco(args.dataset,
                                    "valminusminival",
                                    year=args.year,
                                    auto_download=args.download,
                                    num_shards=num_shards,
                                    shard_id=shard_id)

        dataset_train.prepare()
        # Validation dataset
        dataset_val = CocoDataset()
        val_type = "val" if args.year in '2017' else "minival"
        dataset_val.load_coco(args.dataset,
                              val_type,
                              year=args.year,
                              auto_download=args.download,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              limit=config.VAL_SAMPLES)
        dataset_val.prepare()

        augmentation = iaa.Fliplr(0.5)
        callbacks = []

        ## add callbacks here
        schedule = COCOScheduler(config.LEARNING_RATE,
                                 warmup_steps=config.WARMUP,
                                 gamma=0.1,
                                 drops=config.DROPS,
                                 verbose=is_master)
        callbacks += [schedule]

        external_callbacks = getattr(args, 'external_callbacks', None)
        if external_callbacks is not None:
            callbacks.extend(external_callbacks)

        if is_master:
            print("Training Resnet stage 3+nobn")
        with tf.device("/device:CPU:0"):
            model.train(dev_str,
                        dataset_train,
                        dataset_val,
                        learning_rate=config.LEARNING_RATE,
                        epochs=config.TOT_EPOCHS,
                        layers=config.PRESET_LAYERS_TRAIN,
                        augmentation=augmentation,
                        custom_callbacks=callbacks,
                        dump_tf_timeline=args.dump_tf_timeline,
                        disable_validation=args.disable_validation)

    elif args.command == "evaluate":
        # Validation dataset
        dataset_val = CocoDataset()
        val_type = "val" if args.year in '2017' else "minival"
        coco = dataset_val.load_coco(
            args.dataset,
            val_type,
            year=args.year,
            return_coco=True,
            auto_download=args.download,
            limit=args.limit if args.limit > 0 else None)
        dataset_val.prepare()
        print("Running COCO evaluation on {} images.".format(
            len(dataset_val.image_info)))
        evaluate_coco(model, dataset_val, coco)
    else:
        print("'{}' is not recognized. "
              "Use 'train' or 'evaluate'".format(args.command))
Example #14
def main(argv):
    del argv  # Unused.

    # If given an EfficientDet ckpt, don't use the default backbone ckpt.
    if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None:
        print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format(
            FLAGS.ckpt, FLAGS.backbone_ckpt))
        FLAGS.backbone_ckpt = None

    if FLAGS.use_horovod is not None:
        if FLAGS.dump_all_ranks:
            FLAGS.model_dir += "/worker_" + str(hvd.rank())
        if 'HOROVOD_CYCLE_TIME' not in os.environ:
            os.environ['HOROVOD_CYCLE_TIME'] = '0.5'
        if 'HABANA_HCCL_COMM_API' not in os.environ:
            os.environ['HABANA_HCCL_COMM_API'] = '0'
        hvd_init()

    if not FLAGS.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        if FLAGS.use_horovod:
            assert (horovod_enabled())

    set_env(use_amp=FLAGS.use_amp)

    # deterministic setting
    if FLAGS.sbs_test or FLAGS.deterministic:
        set_deterministic()

    # Check data path
    if FLAGS.mode in (
            'train', 'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev_dir for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`; see the partition logic below.
    # In the TPUEstimator context, `shard` and `replica` mean the same thing;
    # following the API, the code below mixes both terms.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            raise RuntimeError(
                '--num_cores_per_replica must be a product of the array '
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
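                # True iff spatial_dim is divisible by every entry of input_partition_dims.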
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores
        if horovod_enabled():
            num_shards = hvd.size()
        else:
            num_shards = 1

    params = build_estimator_params('train', config, num_shards)
    # Disable input scaling/flip augmentation for the side-by-side test.
    if FLAGS.sbs_test:
        sbs_params = dict(input_rand_hflip=False,
                          train_scale_min=1,
                          train_scale_max=1,
                          dropout_rate=0.0)
        params.update(sbs_params)

    tf_random_seed = 0 if FLAGS.deterministic else None
    run_config = build_estimator_config('train', config, num_shards,
                                        num_cores_per_replica,
                                        input_partition_dims)
    write_hparams_v1(FLAGS.model_dir, {
        'batch_size': FLAGS.train_batch_size,
        **FLAGS.flag_values_dict()
    })

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)

    if FLAGS.mode == 'train':
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        # For deterministic input, pass is_training=False so the dataloader does not augment the data.
        is_training = not FLAGS.deterministic
        use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=is_training,
                                          params=params,
                                          use_fake_data=use_fake_data,
                                          is_deterministic=FLAGS.deterministic)
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1
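        # steps = epochs * examples / (per-rank batch * num ranks); the +1 covers the rounding remainder.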

        # For the side-by-side (SBS) test, train under the SBS dump callback.
        if FLAGS.sbs_test:
            from TensorFlow.common.debug import dump_callback
            SBS_TEST_CONFIG = os.path.join(
                os.environ['TF_TESTS_ROOT'],
                "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json"
            )
            with dump_callback(SBS_TEST_CONFIG):
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)
        else:
            if FLAGS.ckpt is not None:
                train_estimator.train(input_fn=input_fn, steps=max_steps)
            else:
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    elif FLAGS.mode == 'eval':
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.

        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                write_summary(eval_results, ckpt, current_step)

                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_params = build_estimator_params('train', config, num_shards)
        train_config = build_estimator_config('train', config, num_shards,
                                              num_cores_per_replica,
                                              input_partition_dims)
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=train_config,
                                           params=train_params)

        eval_estimator = None

        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)

            train_estimator.train(
                input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                max_steps=(cycle + 1) *
                int(FLAGS.num_examples_per_epoch / FLAGS.train_batch_size))

            # synchronization point for all ranks
            if horovod_enabled():
                hvd.allreduce(tf.constant(0))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.

            if eval_estimator is None:
                eval_params = build_estimator_params('eval', config,
                                                     num_shards)
                eval_config = build_estimator_config('eval', config,
                                                     num_shards,
                                                     num_cores_per_replica,
                                                     input_partition_dims)
                eval_estimator = tf.estimator.tpu.TPUEstimator(
                    model_fn=model_fn_instance,
                    use_tpu=False,
                    train_batch_size=FLAGS.train_batch_size,
                    eval_batch_size=FLAGS.eval_batch_size,
                    config=eval_config,
                    params=eval_params)

            if is_rank0():
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

                checkpoint_path = Path(FLAGS.model_dir)
                last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                                       latest_filename=None)
                current_step = int(os.path.basename(last_ckpt).split('-')[1])
                write_summary(eval_results, FLAGS.model_dir, current_step)
                logging.info('Evaluation results: %s', eval_results)

                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)

    else:
        logging.info('Mode not found.')