Exemplo n.º 1
0
def main(unused_argv):
  """Runs continuous evaluation or training, depending on the flags.

  If `FLAGS.checkpoint_dir` is set, `model_lib_v2.eval_continuously` is
  called against that directory. Otherwise a distribution strategy is
  chosen from `FLAGS.use_tpu` / `FLAGS.num_workers` and
  `model_lib_v2.train_loop` is run inside that strategy's scope.

  Args:
    unused_argv: Not referenced by this function.
  """
  for required_flag in ('model_dir', 'pipeline_config_path'):
    flags.mark_flag_as_required(required_flag)
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    # Evaluation mode: hand everything to the continuous evaluator and stop.
    eval_kwargs = dict(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
    model_lib_v2.eval_continuously(**eval_kwargs)
    return

  # Training mode: pick the distribution strategy before entering its scope.
  if FLAGS.use_tpu:
    tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
    strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
  elif FLAGS.num_workers > 1:
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  else:
    strategy = tf.compat.v2.distribute.MirroredStrategy()

  with strategy.scope():
    train_kwargs = dict(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        use_tpu=FLAGS.use_tpu)
    model_lib_v2.train_loop(**train_kwargs)
    def test_train_loop_then_eval_loop(self):
        """Runs a two-step train loop, then continuous eval on its output.

        The pipeline config for `MODEL_NAME_FOR_TEST` is passed through
        `config_util.clear_fine_tune_checkpoint`, and the resulting
        'new_pipeline.config' path in the temp dir is what both loops use.
        Training runs under a MirroredStrategy over two CPU devices;
        evaluation runs outside that scope with the temp dir as both
        `model_dir` and `checkpoint_dir`. The test has no explicit
        assertions, so it passes when neither loop raises.
        """
        work_dir = tf.test.get_temp_dir()
        source_config = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
        rewritten_config = os.path.join(work_dir, 'new_pipeline.config')
        config_util.clear_fine_tune_checkpoint(source_config, rewritten_config)
        overrides = _get_config_kwarg_overrides()

        num_steps = 2
        cpu_devices = ['/cpu:0', '/cpu:1']
        mirrored = tf2.distribute.MirroredStrategy(cpu_devices)
        with mirrored.scope():
            # checkpoint_every_n=1 is requested so the short run writes
            # checkpoints for the evaluation below.
            model_lib_v2.train_loop(
                rewritten_config,
                model_dir=work_dir,
                train_steps=num_steps,
                checkpoint_every_n=1,
                **overrides)

        model_lib_v2.eval_continuously(
            rewritten_config,
            model_dir=work_dir,
            checkpoint_dir=work_dir,
            train_steps=num_steps,
            wait_interval=1,
            timeout=10,
            **overrides)
  def test_train_loop_then_eval_loop(self):
    """Runs a two-step train loop, then continuous eval, with hparams.

    Hyperparameters are created with the 'load_pretrained=false' override
    and passed as the first positional argument to both
    `model_lib_v2.train_loop` and `model_lib_v2.eval_continuously`. The
    temp dir serves as `model_dir` for both calls and as `checkpoint_dir`
    for evaluation. The test has no explicit assertions, so it passes when
    neither loop raises.
    """
    test_hparams = model_hparams.create_hparams(
        hparams_overrides='load_pretrained=false')
    config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
    overrides = _get_config_kwarg_overrides()
    work_dir = tf.test.get_temp_dir()
    num_steps = 2

    # Train for a couple of steps, checkpointing after every step.
    model_lib_v2.train_loop(test_hparams,
                            config_path,
                            model_dir=work_dir,
                            train_steps=num_steps,
                            checkpoint_every_n=1,
                            **overrides)

    # Evaluate against the directory the training run just wrote to.
    model_lib_v2.eval_continuously(test_hparams,
                                   config_path,
                                   model_dir=work_dir,
                                   checkpoint_dir=work_dir,
                                   train_steps=num_steps,
                                   wait_interval=10,
                                   **overrides)
Exemplo n.º 4
0
 def evaluate(self):
     """
         Runs `model_lib_v2.eval_continuously` on the training checkpoints.

         `self.checkpoint_path` is passed as both `model_dir` and
         `checkpoint_dir`, `self.config_path` as the pipeline config, and
         `postprocess_on_cpu` is set to True.
     """
     eval_kwargs = dict(pipeline_config_path=self.config_path,
                        model_dir=self.checkpoint_path,
                        checkpoint_dir=self.checkpoint_path,
                        postprocess_on_cpu=True)
     model_lib_v2.eval_continuously(**eval_kwargs)
Exemplo n.º 5
0
 def eval_continuously(self):
     """Runs the continuous evaluation loop under a MirroredStrategy.

     The training-loop directory (`self._training_loop_path`) is used as
     `model_dir` and `checkpoint_dir`, and its "pipeline.config" file as
     the pipeline configuration.
     """
     print("Running evaluation loop...")
     mirrored = tf.distribute.MirroredStrategy()
     with mirrored.scope():
         config_file = os.path.join(self._training_loop_path,
                                    "pipeline.config")
         eval_kwargs = dict(pipeline_config_path=config_file,
                            model_dir=self._training_loop_path,
                            checkpoint_dir=self._training_loop_path)
         model_lib_v2.eval_continuously(**eval_kwargs)
Exemplo n.º 6
0
def evaluate(_):
    """Runs `eval_continuously` on the model directory given by the flags.

    The pipeline config is read from 'pipeline.config' inside
    `FLAGS.model_dir`, which is also used as the checkpoint directory.
    Step count, wait interval and timeout come from the corresponding flags.

    Args:
        _: Not referenced by this function.
    """
    eval_kwargs = dict(
        pipeline_config_path=os.path.join(FLAGS.model_dir, 'pipeline.config'),
        train_steps=FLAGS.train_steps,
        model_dir=FLAGS.model_dir,
        checkpoint_dir=FLAGS.model_dir,
        wait_interval=FLAGS.wait_interval,
        timeout=FLAGS.timeout,
    )
    eval_continuously(**eval_kwargs)
def main(unused_argv):
    """Pins TensorFlow to the first GPU, then evaluates or trains.

    If `FLAGS.checkpoint_dir` is set, `model_lib_v2.eval_continuously` is
    called against that directory. Otherwise `model_lib_v2.train_loop` is
    run inside a distribution-strategy scope: a `TPUStrategy` when
    `FLAGS.use_tpu` is set, and TensorFlow's default strategy otherwise.

    Args:
        unused_argv: Not referenced by this function.
    """
    # Set the GPU (device:GPU:0).
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(gpus))
    if gpus:
        # Restrict TensorFlow to only use the first GPU
        try:
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')
    tf.config.set_soft_device_placement(True)

    if FLAGS.checkpoint_dir:
        model_lib_v2.eval_continuously(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
            sample_1_of_n_eval_on_train_examples=(
                FLAGS.sample_1_of_n_eval_on_train_examples),
            checkpoint_dir=FLAGS.checkpoint_dir,
            wait_interval=300,
            timeout=FLAGS.eval_timeout)
        return

    if FLAGS.use_tpu:
        # TPU is automatically inferred if tpu_name is None and
        # we are running under cloud ai-platform.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
    else:
        # Only one GPU is visible, so no mirrored/multi-worker strategy is
        # created; the default strategy keeps the previous
        # single-device behavior for this path.
        strategy = tf.distribute.get_strategy()

    # Previously the TPUStrategy built above was never entered, so a TPU run
    # trained outside of it. Entering the scope makes the strategy the
    # current one while train_loop runs.
    with strategy.scope():
        model_lib_v2.train_loop(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            use_tpu=FLAGS.use_tpu,
            checkpoint_every_n=FLAGS.checkpoint_every_n,
            record_summaries=FLAGS.record_summaries)
Exemplo n.º 8
0
    def evaluate_model(self, hyper_params: HyperParameterInformation) -> None:
        """Runs `eval_continuously` on this object's model directory.

        The pipeline config is 'pipeline.config' inside
        `self.path.model_dir`; that directory is also passed as the
        checkpoint directory. The call uses a fixed wait interval of 180,
        a timeout of 3600, and `override_eval_num_epochs=False`.

        Args:
            hyper_params: Supplies `training_steps`, forwarded as
                `train_steps`.
        """
        config_file = os.path.join(self.path.model_dir, 'pipeline.config')
        eval_kwargs = dict(
            pipeline_config_path=config_file,
            train_steps=hyper_params.training_steps,
            model_dir=self.path.model_dir,
            checkpoint_dir=self.path.model_dir,
            override_eval_num_epochs=False,
            wait_interval=180,
            timeout=3600,
        )
        eval_continuously(**eval_kwargs)
Exemplo n.º 9
0
def main(unused_argv):
    """Evaluates (all checkpoints or continuously) or trains, per the flags.

    If `FLAGS.checkpoint_dir` is set, evaluation runs against it:
    `model_lib_v2.eval_all_checkpoints` when `FLAGS.eval_all_checkpoints`
    is truthy, otherwise `model_lib_v2.eval_continuously`. If it is not
    set, a distribution strategy is chosen from `FLAGS.use_tpu` /
    `FLAGS.num_workers` and `model_lib_v2.train_loop` runs in its scope.

    Args:
        unused_argv: Not referenced by this function.
    """
    for required_flag in ('model_dir', 'pipeline_config_path'):
        flags.mark_flag_as_required(required_flag)
    tf.config.set_soft_device_placement(True)

    if FLAGS.checkpoint_dir:
        evaluate_everything = FLAGS.eval_all_checkpoints
        # Arguments common to both evaluation entry points.
        shared_eval_kwargs = dict(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
            sample_1_of_n_eval_on_train_examples=(
                FLAGS.sample_1_of_n_eval_on_train_examples),
            checkpoint_dir=FLAGS.checkpoint_dir)
        if evaluate_everything:
            model_lib_v2.eval_all_checkpoints(**shared_eval_kwargs)
        else:
            model_lib_v2.eval_continuously(
                wait_interval=300,
                timeout=FLAGS.eval_timeout,
                **shared_eval_kwargs)
        return

    if FLAGS.use_tpu:
        # TPU is automatically inferred if tpu_name is None and
        # we are running under cloud ai-platform.
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name)
        tf.config.experimental_connect_to_cluster(tpu_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
    elif FLAGS.num_workers > 1:
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
        train_kwargs = dict(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            use_tpu=FLAGS.use_tpu,
            checkpoint_every_n=FLAGS.checkpoint_every_n,
            record_summaries=FLAGS.record_summaries)
        model_lib_v2.train_loop(**train_kwargs)
Exemplo n.º 10
0
def main(_):
    """Evaluates or trains using settings loaded from 'system_dict.json'.

    The JSON file in the current working directory takes the place of
    command-line flags. If its "checkpoint_dir" entry is truthy,
    `model_lib_v2.eval_continuously` runs against it; otherwise a
    distribution strategy is chosen from "use_tpu" / "num_workers" and
    `model_lib_v2.train_loop` runs inside its scope.

    Args:
        _: Not referenced by this function.
    """
    with open('system_dict.json') as settings_file:
        settings = json.load(settings_file)

    tf.config.set_soft_device_placement(True)

    if settings["checkpoint_dir"]:
        eval_kwargs = dict(
            pipeline_config_path=settings["pipeline_config_path"],
            model_dir=settings["model_dir"],
            train_steps=settings["num_train_steps"],
            sample_1_of_n_eval_examples=settings["sample_1_of_n_eval_examples"],
            sample_1_of_n_eval_on_train_examples=(
                settings["sample_1_of_n_eval_on_train_examples"]),
            checkpoint_dir=settings["checkpoint_dir"],
            wait_interval=300,
            timeout=settings["eval_timeout"])
        model_lib_v2.eval_continuously(**eval_kwargs)
        return

    if settings["use_tpu"]:
        # TPU is automatically inferred if tpu_name is None and
        # we are running under cloud ai-platform.
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            settings["tpu_name"])
        tf.config.experimental_connect_to_cluster(tpu_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
    elif settings["num_workers"] > 1:
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
        train_kwargs = dict(
            pipeline_config_path=settings["pipeline_config_path"],
            model_dir=settings["model_dir"],
            train_steps=settings["num_train_steps"],
            use_tpu=settings["use_tpu"],
            checkpoint_every_n=settings["checkpoint_every_n"],
            record_summaries=settings["record_summaries"])
        model_lib_v2.train_loop(**train_kwargs)
def main(unused_argv):
    """Announces the mode, enables GPU memory growth, then evaluates or trains.

    The mode is decided by `FLAGS.checkpoint_dir`: when set,
    `model_lib_v2.eval_continuously` runs against it; otherwise a
    distribution strategy is chosen from `FLAGS.use_tpu` /
    `FLAGS.num_workers` and `model_lib_v2.train_loop` runs inside its scope
    with `save_final_config=True`.

    Args:
        unused_argv: Not referenced by this function.
    """
    banner = (
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )

    mode_message = ("\n-------Running evaluation"
                    if FLAGS.checkpoint_dir else "\n-------Running traingin!")
    print(mode_message)

    for required_flag in ('model_dir', 'pipeline_config_path'):
        flags.mark_flag_as_required(required_flag)
    tf.config.set_soft_device_placement(True)

    print(banner)

    physical_gpus = tf.config.experimental.list_physical_devices('GPU')
    if physical_gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for device in physical_gpus:
                tf.config.experimental.set_memory_growth(device, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(physical_gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPUs")
        except RuntimeError as err:
            # Memory growth must be set before GPUs have been initialized
            print(err)

    print(banner)

    if FLAGS.checkpoint_dir:
        eval_kwargs = dict(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
            sample_1_of_n_eval_on_train_examples=(
                FLAGS.sample_1_of_n_eval_on_train_examples),
            checkpoint_dir=FLAGS.checkpoint_dir,
            wait_interval=300,
            timeout=FLAGS.eval_timeout)
        model_lib_v2.eval_continuously(**eval_kwargs)
        return

    if FLAGS.use_tpu:
        # TPU is automatically inferred if tpu_name is None and
        # we are running under cloud ai-platform.
        tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name)
        tf.config.experimental_connect_to_cluster(tpu_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(tpu_resolver)
    elif FLAGS.num_workers > 1:
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
        train_kwargs = dict(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            save_final_config=True,
            train_steps=FLAGS.num_train_steps,
            use_tpu=FLAGS.use_tpu,
            checkpoint_every_n=FLAGS.checkpoint_every_n,
            record_summaries=FLAGS.record_summaries)
        model_lib_v2.train_loop(**train_kwargs)