Example #1
def main(unused_argv):
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')

    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards))

    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=config,
        hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
        pipeline_config_path=FLAGS.pipeline_config_path,
        train_steps=FLAGS.num_train_steps,
        eval_steps=FLAGS.num_eval_steps,
        use_tpu_estimator=True,
        use_tpu=FLAGS.use_tpu,
        num_shards=FLAGS.num_shards,
        batch_size=FLAGS.train_batch_size)
    estimator = train_and_eval_dict['estimator']
    train_input_fn = train_and_eval_dict['train_input_fn']
    eval_input_fn = train_and_eval_dict['eval_input_fn']
    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
    train_steps = train_and_eval_dict['train_steps']
    eval_steps = train_and_eval_dict['eval_steps']

    if FLAGS.mode == 'train':
        estimator.train(input_fn=train_input_fn, max_steps=train_steps)

    # Continuous evaluation.
    if FLAGS.mode == 'eval':
        if FLAGS.eval_training_data:
            name = 'training_data'
            input_fn = eval_on_train_input_fn
        else:
            name = 'validation_data'
            input_fn = eval_input_fn
        model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn,
                                  eval_steps, train_steps, name)
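
The flag definitions are omitted from this listing. As a rough sketch, a representative subset of the flags referenced above could be declared with absl-style flags along the following lines (the defaults and help strings are assumptions, not taken from the source):

from absl import flags

flags.DEFINE_string('model_dir', None, 'Directory for checkpoints and summaries.')
flags.DEFINE_string('pipeline_config_path', None, 'Path to the pipeline config file.')
flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU to use.')
flags.DEFINE_string('tpu_zone', None, 'GCE zone of the Cloud TPU (optional).')
flags.DEFINE_string('gcp_project', None, 'GCP project of the Cloud TPU (optional).')
flags.DEFINE_integer('num_shards', 8, 'Number of TPU shards (cores).')
flags.DEFINE_integer('iterations_per_loop', 100, 'Iterations per TPU training loop.')
flags.DEFINE_string('mode', 'train', "One of 'train' or 'eval'.")

FLAGS = flags.FLAGS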
Example #2
def main(argv):
    del argv  # Unused.

    params = params_dict.ParamsDict(retinanet_config.RETINANET_CFG,
                                    retinanet_config.RETINANET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)
    params.validate()
    params.lock()

    model_params = dict(params.as_dict(),
                        use_tpu=FLAGS.use_tpu,
                        mode=tf.estimator.ModeKeys.PREDICT,
                        transpose_input=False)

    print(' - Setting up TPUEstimator...')
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=serving.serving_model_fn_builder(
            FLAGS.use_tpu, FLAGS.output_image_info,
            FLAGS.output_normalized_coordinates,
            FLAGS.cast_num_detections_to_float),
        model_dir=None,
        config=tpu_config.RunConfig(
            tpu_config=tpu_config.TPUConfig(iterations_per_loop=1),
            master='local',
            evaluation_master='local'),
        params=model_params,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        export_to_tpu=FLAGS.use_tpu,
        export_to_cpu=True)

    print(' - Exporting the model...')
    input_type = FLAGS.input_type
    image_size = [int(x) for x in FLAGS.input_image_size.split(',')]
    export_path = estimator.export_saved_model(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=functools.partial(
            serving.serving_input_fn,
            batch_size=FLAGS.batch_size,
            desired_image_size=image_size,
            stride=(2**params.anchor.max_level),
            input_type=input_type,
            input_name=FLAGS.input_name),
        checkpoint_path=FLAGS.checkpoint_path)

    print(' - Done! path: %s' % export_path)
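
Once exported, the SavedModel can be loaded for inference with the TF 1.x predictor API. A minimal sketch, assuming input_type is 'image_bytes' and that the serving signature therefore exposes an 'image_bytes' feed key (the image path below is hypothetical):

import tensorflow as tf

# export_path is the directory returned by estimator.export_saved_model() above.
predict_fn = tf.contrib.predictor.from_saved_model(export_path)

with tf.gfile.GFile('/tmp/example.jpg', 'rb') as f:  # hypothetical image path
    image_bytes = f.read()

# Feed/fetch keys depend on serving.serving_input_fn; 'image_bytes' is assumed here.
outputs = predict_fn({'image_bytes': [image_bytes]})
print(list(outputs.keys()))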
Example #3
def main(_):
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)
  config.is_training_bn = False
  config.train_batch_size = FLAGS.batch_size
  config.eval_batch_size = FLAGS.batch_size

  model_params = dict(
      config.values(),
      use_tpu=FLAGS.use_tpu,
      mode=tf.estimator.ModeKeys.PREDICT,
      transpose_input=False)

  print(' - Setting up TPUEstimator...')
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=mask_rcnn_model.mask_rcnn_model_fn,
      model_dir=FLAGS.model_dir,
      config=tpu_config.RunConfig(
          tpu_config=tpu_config.TPUConfig(
              iterations_per_loop=FLAGS.iterations_per_loop),
          master='local',
          evaluation_master='local'),
      params=model_params,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      export_to_tpu=FLAGS.use_tpu,
      export_to_cpu=True,
      experimental_exported_model_uses_all_cores=FLAGS.inference_with_all_cores)

  print(' - Exporting the model...')
  input_type = FLAGS.input_type
  export_path = estimator.export_saved_model(
      export_dir_base=FLAGS.export_dir,
      serving_input_receiver_fn=functools.partial(
          serving_inputs.serving_input_fn,
          batch_size=FLAGS.batch_size,
          desired_image_size=config.image_size,
          padding_stride=(2**config.max_level),
          input_type=input_type),
      checkpoint_path=FLAGS.checkpoint_path)
  if FLAGS.add_warmup_requests and input_type == 'image_bytes':
    inference_warmup.write_warmup_requests(
        export_path,
        FLAGS.model_name,
        config.image_size,
        batch_sizes=[FLAGS.batch_size],
        image_format='JPEG',
        input_signature=serving_inputs.INPUT_SIGNATURE)
Example #4
def main(unused_argv):
    del unused_argv  # Unused

    if FLAGS.input_layout not in ['NCHW', 'NHWC']:
        raise RuntimeError('--input_layout must be one of [NCHW, NHWC]')

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        evaluation_master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tpu_config.TPUConfig(iterations_per_loop=FLAGS.iterations,
                                        num_shards=FLAGS.num_shards))

    inception_classifier = tpu_estimator.TPUEstimator(
        model_fn=inception_model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        batch_axis=(get_batch_axis(FLAGS.train_batch_size // FLAGS.num_shards),
                    0))

    for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
        # tensors_to_log = {
        #     'learning_rate': 'learning_rate',
        #     'prediction_loss': 'prediction_loss',
        #     'train_accuracy': 'train_accuracy'
        # }

        # logging_hook = tf.train.LoggingTensorHook(
        #     tensors=tensors_to_log, every_n_iter=100)

        tf.logging.info('Starting training cycle %d.' % cycle)
        inception_classifier.train(input_fn=ImageNetInput(True),
                                   steps=FLAGS.train_steps_per_eval)

        if FLAGS.eval_enabled:
            eval_steps = (imagenet.get_split_size('validation') //
                          FLAGS.eval_batch_size)
            tf.logging.info('Starting evaluation cycle %d .' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=ImageNetInput(False), steps=eval_steps)
            tf.logging.info('Evaluation results: %s' % eval_results)
Example #5
 def test_no_session_config_overwrite_with_cluster_spec(self):
     tf_config = {
         'cluster': {
             run_config_lib.TaskType.CHIEF: ['host3:3'],
             run_config_lib.TaskType.WORKER: ['host3:4']
         },
         'task': {
             'type': run_config_lib.TaskType.CHIEF,
             'index': 0
         }
     }
     with _set_tf_config_env_variable(tf_config):
         session_config = config_pb2.ConfigProto(allow_soft_placement=True)
         run_config = tpu_config_lib.RunConfig(
             session_config=session_config)
         self.assertEqual(session_config, run_config.session_config)
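
_set_tf_config_env_variable is a test helper that is not shown in this snippet. A plausible sketch (an assumption about the helper, not the actual implementation) is a context manager that serializes the dict into the TF_CONFIG environment variable, which is how RunConfig discovers the cluster spec, and restores the previous value on exit:

import contextlib
import json
import os

@contextlib.contextmanager
def _set_tf_config_env_variable(tf_config):
    """Temporarily sets TF_CONFIG so RunConfig picks up the cluster spec."""
    old_value = os.environ.get('TF_CONFIG')
    os.environ['TF_CONFIG'] = json.dumps(tf_config)
    try:
        yield
    finally:
        if old_value is None:
            del os.environ['TF_CONFIG']
        else:
            os.environ['TF_CONFIG'] = old_value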
Example #6
def main(unused_argv):
  tf.logging.set_verbosity(FLAGS.log)

  hparams = HParams(
      batch_size=64,
      rnn_layer_sizes=[64, 64],
      dropout_keep_prob=0.5,
      skip_first_n_losses=0,
      clip_norm=5,
      initial_learning_rate=0.01,
      decay_steps=1000,
      decay_rate=0.95)

  use_fake_data = not FLAGS.sequence_example_file

  if not use_fake_data:
    sequence_example_file_paths = tf.gfile.Glob(
        os.path.expanduser(FLAGS.sequence_example_file))
    tf.logging.info('Using real data from : %s', sequence_example_file_paths)

    input_fn = input_fn_by_record_files(
        sequence_example_file_paths, _INPUT_SIZE,
        padding_length=(
            FLAGS.static_padding_length if FLAGS.use_static_rnn else None))
  else:
    tf.logging.info('Using fake data')
    input_fn = input_fn_by_dataset_with_fake_data(
        _INPUT_SIZE,
        padding_length=(
            FLAGS.static_padding_length if FLAGS.use_static_rnn else None))

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )

  model_fn = events_rnn_graph.build_model_fn(hparams)

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      config=run_config,
      train_batch_size=hparams.batch_size,
      use_tpu=FLAGS.use_tpu)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.num_training_steps)
Example #7
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards))
    estimator = tpu_estimator.TPUEstimator(model_fn=model_fn,
                                           use_tpu=FLAGS.use_tpu,
                                           config=run_config,
                                           train_batch_size=FLAGS.batch_size)
    estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
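
The model_fn and input_fn used by this example are defined elsewhere. Below is a minimal sketch of TPU-compatible versions, using a toy linear-regression model purely as an assumption for illustration; the essential points are that TPUEstimator passes the batch size it computes (derived from train_batch_size and the sharding configuration) to input_fn via params['batch_size'], that the optimizer is wrapped in CrossShardOptimizer, and that model_fn returns a TPUEstimatorSpec:

import tensorflow as tf


def input_fn(params):
    # TPUEstimator injects the batch size to use into params.
    batch_size = params['batch_size']

    def _make_fake_batch(_):
        features = tf.random_uniform([batch_size, 10])
        labels = tf.random_uniform([batch_size, 1])
        return features, labels

    return tf.data.Dataset.range(1).repeat().map(_make_fake_batch)


def model_fn(features, labels, mode, params):
    del params  # Unused in this sketch.
    predictions = tf.layers.dense(features, 1)
    loss = tf.losses.mean_squared_error(labels, predictions)
    # On TPU the optimizer must be wrapped so gradients are aggregated across shards.
    optimizer = tf.contrib.tpu.CrossShardOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate=0.01))
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)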
Example #8
def train(working_dir, *tf_records, steps=None):
    if FLAGS.use_tpu:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=None, project=None)

        config = tpu_config.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=working_dir,
            save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
            tpu_config=tpu_config.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long

        estimator = tpu_estimator.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.train_batch_size)

        def input_fn(params):
            return preprocessing.get_tpu_input_tensors(params['batch_size'],
                                                       tf_records)

        # TODO: get hooks working again with TPUEstimator.
        hooks = []
    else:
        estimator = get_estimator(working_dir)

        def input_fn():
            return preprocessing.get_input_tensors(
                FLAGS.train_batch_size,
                tf_records,
                filter_amount=1.0,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size)

        hooks = [
            UpdateRatioSessionHook(working_dir),
            EchoStepCounterHook(output_dir=working_dir)
        ]

    if steps is None:
        steps = EXAMPLES_PER_GENERATION // FLAGS.train_batch_size
    print("Training, steps = {}".format(steps))
    estimator.train(input_fn, steps=int(steps), hooks=hooks)
Example #9
def main(unused_argv):
    del unused_argv  # Unused

    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.save_checkpoints_secs:
        if not FLAGS.eval_steps:
            tf.logging.info(
                "If checkpoint is expected, please set --save_checkpoints_secs."
            )
        else:
            tf.logging.fatal(
                "Flag --save_checkpoints_secs must be set for evaluation. Aborting."
            )

    if not FLAGS.train_file:
        tf.logging.fatal(
            "Flag --train_file must be set for training. Aborting.")

    if FLAGS.eval_steps and not FLAGS.eval_file:
        tf.logging.fatal(
            "Flag --eval_file must be set for evaluation. Aborting.")

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        evaluation_master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
    )

    estimator = tpu_estimator.TPUEstimator(model_fn=model_fn,
                                           use_tpu=FLAGS.use_tpu,
                                           train_batch_size=FLAGS.batch_size,
                                           eval_batch_size=FLAGS.batch_size,
                                           config=run_config)

    estimator.train(input_fn=get_input_fn(FLAGS.train_file),
                    max_steps=FLAGS.train_steps)

    if FLAGS.eval_steps:
        estimator.evaluate(input_fn=get_input_fn(FLAGS.eval_file),
                           steps=FLAGS.eval_steps)
Example #10
  def _build_estimator(self, is_training):
    """Returns an Estimator object.

    Args:
      is_training: Boolean, whether or not we're in training mode.

    Returns:
      A tf.estimator.Estimator.
    """
    config = self._config
    save_checkpoints_steps = config.logging.checkpoint.save_checkpoints_steps
    keep_checkpoint_max = self._config.logging.checkpoint.num_to_keep
    if is_training and config.use_tpu:
      iterations = config.tpu.iterations
      num_shards = config.tpu.num_shards
      run_config = tpu_config.RunConfig(
          save_checkpoints_secs=None,
          save_checkpoints_steps=save_checkpoints_steps,
          keep_checkpoint_max=keep_checkpoint_max,
          master=FLAGS.master,
          evaluation_master=FLAGS.master,
          model_dir=self._logdir,
          tpu_config=tpu_config.TPUConfig(
              iterations_per_loop=iterations,
              num_shards=num_shards,
              per_host_input_for_training=num_shards <= 8),
          tf_random_seed=FLAGS.tf_random_seed)

      batch_size = config.data.batch_size
      return tpu_estimator.TPUEstimator(
          model_fn=self._get_model_fn(),
          config=run_config,
          use_tpu=True,
          train_batch_size=batch_size,
          eval_batch_size=batch_size)
    else:
      run_config = tf.estimator.RunConfig().replace(
          model_dir=self._logdir,
          save_checkpoints_steps=save_checkpoints_steps,
          keep_checkpoint_max=keep_checkpoint_max,
          tf_random_seed=FLAGS.tf_random_seed)
      return tf.estimator.Estimator(
          model_fn=self._get_model_fn(),
          config=run_config)
Example #11
    def test_create_tpu_estimator_and_inputs(self):
        """Tests that number of train/eval defaults to config values."""

        run_config = tpu_config.RunConfig()
        hparams = model_hparams.create_hparams(
            hparams_overrides='load_pretrained=false')
        pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
        train_steps = 20
        train_and_eval_dict = model_lib.create_estimator_and_inputs(
            run_config,
            hparams,
            pipeline_config_path,
            train_steps=train_steps,
            use_tpu_estimator=True)
        estimator = train_and_eval_dict['estimator']
        train_steps = train_and_eval_dict['train_steps']

        self.assertIsInstance(estimator, tpu_estimator.TPUEstimator)
        self.assertEqual(20, train_steps)
Example #12
def main(argv):
  del argv  # Unused.

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations, num_shards=FLAGS.num_shards),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
Example #13
def build_run_config():
    """Return RunConfig for TPU estimator."""
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    eval_steps = model_lib.NUM_EVAL_IMAGES // FLAGS.eval_batch_size
    iterations_per_loop = (eval_steps if FLAGS.mode == 'eval' else
                           FLAGS.iterations_per_loop)
    save_checkpoints_steps = FLAGS.save_checkpoints_steps or iterations_per_loop
    run_config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=tpu_config.InputPipelineConfig.
            PER_HOST_V2))
    return run_config
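
A hedged usage sketch of the RunConfig returned above (model_fn and the train_batch_size flag name are assumptions; eval_batch_size and num_shards match the flags already used in this example):

run_config = build_run_config()
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,                        # defined elsewhere in the project
    config=run_config,
    use_tpu=True,
    train_batch_size=FLAGS.train_batch_size,  # assumed flag name
    eval_batch_size=FLAGS.eval_batch_size)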
Example #14
def run_toy_model_tpu():
    """Run a toy model on TPU."""
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    iterations_per_loop = FLAGS.iterations
    mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=None,  # Disable the default saver
        save_checkpoints_secs=None,  # Disable the default saver
        log_step_count_steps=iterations_per_loop,
        save_summary_steps=iterations_per_loop,
        tpu_config=tpu_config.TPUConfig(
            num_shards=mesh_shape.size,
            iterations_per_loop=iterations_per_loop,
            num_cores_per_replica=1,
            per_host_input_for_training=tpu_config.InputPipelineConfig.
            BROADCAST))
    classifier = tpu_estimator.TPUEstimator(use_tpu=True,
                                            model_fn=model_fn,
                                            config=config,
                                            train_batch_size=FLAGS.batch_size,
                                            eval_batch_size=FLAGS.batch_size)
    current_step = estimator_lib._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    logging.info('Current step %d', current_step)
    if FLAGS.steps_per_checkpoint == 0:
        classifier.train(input_fn=ToyModelInput(), max_steps=FLAGS.train_steps)
        return
    while current_step < FLAGS.train_steps:
        next_checkpoint = min(current_step + FLAGS.steps_per_checkpoint,
                              FLAGS.train_steps)
        classifier.train(input_fn=ToyModelInput(), max_steps=next_checkpoint)
        current_step = next_checkpoint
        logging.info('Starting to evaluate.')
        eval_results = classifier.evaluate(
            input_fn=ToyModelInput(), steps=156
        )  # since we have 10000 examples and batch_size = 64 per host
        logging.info('Eval results: %s', eval_results)
Example #15
def main(argv):
  del argv
  training_examples = (FLAGS.train_epochs * 40000)
  eval_examples = 10000
  iterations_per_loop = ((training_examples // 10) // FLAGS.train_batch_size)

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.steps_per_checkpoint,
      log_step_count_steps=iterations_per_loop,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      params=dict(CIFAR_SMALL_PARAMS, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after every 10% of the training steps.
  for cycle in range(10):
    tf.logging.info("Starting %d train steps" %
                    (training_examples // 10 // FLAGS.train_batch_size))
    estimator.train(
        input_fn=InputReader(FLAGS.train_file, is_training=True),
        steps=training_examples // 10 // FLAGS.train_batch_size)

    tf.logging.info("Starting evaluation cycle %d ." % cycle)
    print(estimator.evaluate(
        input_fn=InputReader(FLAGS.train_file, is_training=False),
        steps=eval_examples // FLAGS.eval_batch_size,
    ))
Example #16
  def _test_warm_start(self, warm_start_from=None):
    """Tests whether WarmStartSettings work as intended."""
    def generator_with_new_variable(noise_dict, mode):
      variable_scope.get_variable(name=self.new_variable_name,
                                  initializer=self.new_variable_value,
                                  trainable=True)
      return generator_fn(noise_dict, mode)

    est = estimator.TPUGANEstimator(
        generator_fn=generator_fn,
        discriminator_fn=discriminator_fn,
        generator_loss_fn=losses.wasserstein_generator_loss,
        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
        generator_optimizer=training.GradientDescentOptimizer(1.0),
        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
        train_batch_size=4,
        use_tpu=FLAGS.use_tpu,
        config=self._config)

    def train_input_fn(params):
      data = np.zeros([params['batch_size'], 4], dtype=np.float32)
      return data, data

    est.train(train_input_fn, steps=1)

    est_warm = estimator.TPUGANEstimator(
        generator_fn=generator_with_new_variable,
        discriminator_fn=discriminator_fn,
        generator_loss_fn=losses.wasserstein_generator_loss,
        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
        generator_optimizer=training.GradientDescentOptimizer(1.0),
        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
        config=tpu_config.RunConfig(
            model_dir=None if warm_start_from else self._model_dir),
        train_batch_size=4,
        use_tpu=FLAGS.use_tpu,
        warm_start_from=warm_start_from)

    est_warm.train(train_input_fn, steps=1)

    return est_warm
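
A hedged sketch of how warm_start_from might be supplied when calling this test helper: it can be a checkpoint path or a tf.estimator.WarmStartSettings object pointing at the model directory written by the first estimator (the settings below are illustrative assumptions):

warm_start = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from=self._model_dir,
    vars_to_warm_start='.*')  # warm-start every variable that matches
est_warm = self._test_warm_start(warm_start_from=warm_start)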
Example #17
    def testTrainingPipeline(self, training_method):
        output_directory = '/tmp/'

        g = tf.Graph()
        with g.as_default():

            dataset = self._retrieve_data(is_training=False, data_dir=False)

            FLAGS.transpose_input = False
            FLAGS.use_tpu = False
            FLAGS.mode = 'train'
            FLAGS.mask_init_method = 'random'
            FLAGS.precision = 'float32'
            FLAGS.train_steps = 1
            FLAGS.train_batch_size = 1
            FLAGS.eval_batch_size = 1
            FLAGS.steps_per_eval = 1
            FLAGS.model_architecture = 'resnet'

            params = {}
            params['output_dir'] = output_directory
            params['training_method'] = training_method
            params['use_tpu'] = False
            set_lr_schedule()

            run_config = tpu_config.RunConfig(master=None,
                                              model_dir=None,
                                              save_checkpoints_steps=1,
                                              tpu_config=tpu_config.TPUConfig(
                                                  iterations_per_loop=1,
                                                  num_shards=1))

            classifier = tpu_estimator.TPUEstimator(
                use_tpu=False,
                model_fn=resnet_model_fn_w_pruning,
                params=params,
                config=run_config,
                train_batch_size=1,
                eval_batch_size=1)

            classifier.train(input_fn=dataset.input_fn, max_steps=1)
Example #18
def create_estimator(master,
                     model_dir,
                     use_tpu,
                     iterations_per_loop,
                     num_shards,
                     model_params,
                     include_features_in_predictions=True,
                     decode_keys=(),
                     train_init_checkpoint=None,
                     train_warmup_steps=10000,
                     save_checkpoints_steps=1000,
                     keep_checkpoint_max=1):
    """Returns an tensorflow estimator."""
    # Define GPU Config for session
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    #                         gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.8))  # % of GPU allocated
    config.gpu_options.allow_growth = False

    # This is the runtime config for tensorflow estimators
    run_config = tpu_config.RunConfig(
        master=master,
        model_dir=model_dir,
        session_config=config,
        tpu_config=tpu_config.TPUConfig(iterations_per_loop),
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=keep_checkpoint_max)

    return tpu_estimator.TPUEstimator(
        model_fn=_estimator_model_fn(use_tpu, model_params, model_dir,
                                     include_features_in_predictions,
                                     decode_keys, train_init_checkpoint,
                                     train_warmup_steps),
        use_tpu=use_tpu,  # false
        train_batch_size=model_params.batch_size *
        num_shards,  # batch_size * 1 by default
        eval_batch_size=model_params.batch_size *
        num_shards,  # batch_size * 1 by default
        predict_batch_size=model_params.batch_size *
        num_shards,  # batch_size * 1 by default
        config=run_config)
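
A hedged usage sketch of create_estimator (the argument values, the /tmp path, and train_input_fn are assumptions for illustration; model_params is whatever hyperparameter object the project uses and must expose batch_size):

estimator = create_estimator(
    master='',                    # empty string runs locally
    model_dir='/tmp/model_dir',   # hypothetical path
    use_tpu=False,
    iterations_per_loop=100,
    num_shards=1,
    model_params=model_params)
estimator.train(input_fn=train_input_fn, max_steps=1000)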
Example #19
def main(unused_argv):
    config = tpu_config.RunConfig(master=FLAGS.master,
                                  evaluation_master=FLAGS.master,
                                  model_dir=FLAGS.model_dir,
                                  tpu_config=tpu_config.TPUConfig(
                                      iterations_per_loop=100, num_shards=8))
    resnet_classifier = tpu_estimator.TPUEstimator(
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    for cycle in range(FLAGS.train_steps // FLAGS.steps_per_eval):
        tf.logging.info('Starting a training cycle.')
        resnet_classifier.train(input_fn=ImageNetInput(True),
                                steps=FLAGS.steps_per_eval)

        _EVAL_STEPS = 50000 // FLAGS.eval_batch_size
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=ImageNetInput(False), steps=_EVAL_STEPS)
        tf.logging.info('Eval results: %s' % eval_results)
Example #20
def main(unused_argv):
    assert len(unused_argv) == 1, ("Unrecognized command line arguments: %s" %
                                   unused_argv[1:])

    start = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        #tpu_config=tpu_config.TPUConfig(5, FLAGS.num_shards, per_host_input_for_training = True),
        tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
    )
    estimator = tpu_estimator.TPUEstimator(model_fn=model_fn,
                                           use_tpu=FLAGS.use_tpu,
                                           train_batch_size=64,
                                           config=run_config)
    estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
    total = time.time() - start
    print("Total time: " + str(total))
Example #21
def main(argv):
    del argv
    training_examples = FLAGS.train_epochs * 40000
    eval_examples = 10000

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=training_examples // 10 // FLAGS.batch_size,
            num_shards=FLAGS.num_shards,
        ),
    )

    estimator = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        params=dict(CIFAR_SMALL_PARAMS, use_tpu=FLAGS.use_tpu),
    )

    # Evaluate the test set after 10% of training examples are finished.
    for _ in range(10):
        estimator.train(input_fn=InputReader(FLAGS.train_file,
                                             is_training=True),
                        steps=training_examples // 10)

        print(
            estimator.evaluate(
                input_fn=InputReader(FLAGS.train_file, is_training=False),
                steps=eval_examples,
            ))
Example #22
File: dual_net.py  Project: flexpad/minigo
def get_tpu_estimator(working_dir):
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()

    run_config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=working_dir,
        save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop),
        save_summary_steps=FLAGS.summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))

    return tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
        eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores)
Example #23
def main(unused_argv):
    if FLAGS.use_tpu:
        if FLAGS.master is None and FLAGS.tpu_name is None:
            raise RuntimeError(
                "You must specify either --master or --tpu_name.")

        if FLAGS.master is not None:
            if FLAGS.tpu_name is not None:
                tf.logging.warn(
                    "Both --master and --tpu_name are set. Ignoring "
                    "--tpu_name and using --master.")
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
            tpu_grpc_url = tpu_cluster_resolver.get_master()
    else:
        # URL is unused if running locally without TPU
        tpu_grpc_url = None

    batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    steps_per_checkpoint = FLAGS.steps_per_checkpoint
    iterations_per_loop = FLAGS.iterations_per_loop
    eval_steps = _NUM_EVAL_IMAGES // FLAGS.eval_batch_size
    if iterations_per_loop is None or steps_per_checkpoint < iterations_per_loop:
        iterations_per_loop = steps_per_checkpoint
    if FLAGS.mode == "eval":
        iterations_per_loop = eval_steps
    params = {
        "batches_per_epoch": batches_per_epoch,
    }

    config = tpu_config.RunConfig(master=tpu_grpc_url,
                                  evaluation_master=tpu_grpc_url,
                                  model_dir=FLAGS.model_dir,
                                  save_checkpoints_steps=steps_per_checkpoint,
                                  log_step_count_steps=iterations_per_loop,
                                  tpu_config=tpu_config.TPUConfig(
                                      iterations_per_loop=iterations_per_loop,
                                      num_shards=FLAGS.num_shards))

    densenet_estimator = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        params=params)

    if FLAGS.mode == "train":
        tf.logging.info(
            "Training for %d steps (%.2f epochs in total)." %
            (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch))
        densenet_estimator.train(input_fn=ImageNetInput(True),
                                 max_steps=FLAGS.train_steps)

    elif FLAGS.mode == "train_and_eval":
        current_step = 0
        tf.logging.info(
            "Training for %d steps (%.2f epochs in total). Current "
            "step %d" % (FLAGS.train_steps,
                         FLAGS.train_steps / batches_per_epoch, current_step))
        while current_step < FLAGS.train_steps:
            next_checkpoint = min(current_step + steps_per_checkpoint,
                                  FLAGS.train_steps)
            num_steps = next_checkpoint - current_step
            current_step = next_checkpoint
            densenet_estimator.train(input_fn=ImageNetInput(True),
                                     steps=num_steps)

            tf.logging.info("Starting to evaluate.")
            eval_results = densenet_estimator.evaluate(
                input_fn=ImageNetInput(False),
                steps=_NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
            tf.logging.info("Eval results: %s" % eval_results)

    else:

        def terminate_eval():
            tf.logging.info(
                "Terminating eval after %d seconds of no checkpoints" %
                FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        # If the evaluation worker is delayed in processing a new checkpoint,
        # the checkpoint file may be deleted by the trainer before it can be
        # evaluated.
        # Ignore the error in this case.
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            tf.logging.info("Starting to evaluate.")
            try:
                eval_results = densenet_estimator.evaluate(
                    input_fn=ImageNetInput(False),
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                tf.logging.info("Eval results: %s" % eval_results)
            except tf.errors.NotFoundError:
                tf.logging.info(
                    "Checkpoint %s no longer exists, skipping checkpoint", ckpt)
Example #24
def main(argv):
    del argv

    if FLAGS.use_tpu:
        if FLAGS.master is None and FLAGS.tpu_name is None:
            raise RuntimeError(
                'You must specify either --master or --tpu_name.')

        if FLAGS.master is not None:
            if FLAGS.tpu_name is not None:
                tf.logging.warn(
                    'Both --master and --tpu_name are set. Ignoring '
                    '--tpu_name and using --master.')
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
            tpu_grpc_url = tpu_cluster_resolver.get_master()
    else:
        tpu_grpc_url = None

    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        tpu_config=tpu_config.TPUConfig(
            num_shards=FLAGS.num_shards,
            iterations_per_loop=FLAGS.iterations_per_loop))

    # Set module-level global variable so that model_fn and input_fn can be
    # identical for each different kind of dataset and model
    global dataset, model
    if FLAGS.dataset == 'mnist':
        dataset = mnist_input
        model = mnist_model
    elif FLAGS.dataset == 'cifar':
        dataset = cifar_input
        model = cifar_model
    else:
        raise ValueError('Invalid dataset: %s' % FLAGS.dataset)

    # TPU-based estimator used for TRAIN and EVAL
    est = tpu_estimator.TPUEstimator(model_fn=model_fn,
                                     use_tpu=FLAGS.use_tpu,
                                     config=config,
                                     train_batch_size=FLAGS.batch_size,
                                     eval_batch_size=FLAGS.batch_size)

    # CPU-based estimator used for PREDICT (generating images)
    cpu_est = tpu_estimator.TPUEstimator(model_fn=model_fn,
                                         use_tpu=False,
                                         config=config,
                                         predict_batch_size=_NUM_VIZ_IMAGES)

    tf.gfile.MakeDirs(os.path.join(FLAGS.model_dir, 'generated_images'))

    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info('Starting training for %d steps, current step: %d' %
                    (FLAGS.train_steps, current_step))
    while current_step < FLAGS.train_steps:
        next_checkpoint = min(current_step + FLAGS.train_steps_per_eval,
                              FLAGS.train_steps)
        est.train(input_fn=generate_input_fn(True), max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training step %d' % current_step)

        if FLAGS.eval_loss:
            # Evaluate loss on test set
            metrics = est.evaluate(input_fn=generate_input_fn(False),
                                   steps=dataset.NUM_EVAL_IMAGES //
                                   FLAGS.batch_size)
            tf.logging.info('Finished evaluating')
            tf.logging.info(metrics)

        # Render some generated images
        generated_iter = cpu_est.predict(input_fn=noise_input_fn)
        images = [p['generated_images'][:, :, :] for p in generated_iter]
        assert len(images) == _NUM_VIZ_IMAGES
        image_rows = [
            np.concatenate(images[i:i + 10], axis=0)
            for i in range(0, _NUM_VIZ_IMAGES, 10)
        ]
        tiled_image = np.concatenate(image_rows, axis=1)

        img = dataset.convert_array_to_image(tiled_image)

        step_string = str(current_step).zfill(5)
        file_obj = tf.gfile.Open(
            os.path.join(FLAGS.model_dir, 'generated_images',
                         'gen_%s.png' % (step_string)), 'w')
        img.save(file_obj, format='png')
        tf.logging.info('Finished generating images')
Example #25
def main(unused_argv):
    del unused_argv  # Unused

    if FLAGS.master is None and FLAGS.tpu_name is None:
        raise RuntimeError('You must specify either --master or --tpu_name.')

    if FLAGS.master is not None:
        if FLAGS.tpu_name is not None:
            tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                            '--tpu_name and using --master.')
        tpu_grpc_url = FLAGS.master
    else:
        tpu_cluster_resolver = (tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
        tpu_grpc_url = tpu_cluster_resolver.get_master()

    batch_size_per_shard = FLAGS.train_batch_size // FLAGS.num_shards
    params = {
        'input_perm': [0, 1, 2, 3],
        'output_perm': [0, 1, 2, 3],
    }

    batch_axis = 0
    if FLAGS.transpose_enabled:
        if batch_size_per_shard >= 64:
            params['input_perm'] = [3, 0, 1, 2]
            params['output_perm'] = [1, 2, 3, 0]
            batch_axis = 3
        else:
            params['input_perm'] = [2, 0, 1, 3]
            params['output_perm'] = [1, 2, 0, 3]
            batch_axis = 2

    if FLAGS.eval_total_size > 0:
        eval_size = FLAGS.eval_total_size
    else:
        eval_size = _NUM_EVAL_IMAGES
    eval_steps = eval_size // FLAGS.eval_batch_size

    iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

    eval_batch_size = (None
                       if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

    per_host_input_for_training = (FLAGS.num_shards <= 8
                                   if FLAGS.mode == 'train' else True)

    run_config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=iterations,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=per_host_input_for_training))

    inception_classifier = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params=params,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=eval_batch_size,
        batch_axis=(batch_axis, 0))

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
    imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

    if FLAGS.moving_average:
        eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
    else:
        eval_hooks = []

    if FLAGS.mode == 'eval':

        def terminate_eval():
            tf.logging.info('%d seconds without new checkpoints have elapsed '
                            '... terminating eval' % FLAGS.eval_timeout)
            return True

        def get_next_checkpoint():
            return evaluation.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval)

        for checkpoint in get_next_checkpoint():
            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = inception_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                tf.logging.info('Evaluation results: %s' % eval_results)
            except tf.errors.NotFoundError:
                # skip checkpoint if it gets deleted prior to evaluation
                tf.logging.info(
                    'Checkpoint %s no longer exists ... skipping', checkpoint)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
            tf.logging.info('Starting training cycle %d.' % cycle)
            inception_classifier.train(input_fn=imagenet_train.input_fn,
                                       steps=FLAGS.train_steps_per_eval)

            tf.logging.info('Starting evaluation cycle %d .' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Starting training ...')
        inception_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=FLAGS.train_steps)
Example #26
def main(unused_argv):
    del unused_argv  # Unused

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
        'Invalid value for --precision flag; must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', FLAGS.precision)

    params = {
        'input_perm': [0, 1, 2, 3],
        'output_perm': [0, 1, 2, 3],
    }

    batch_axis = 0
    if FLAGS.transpose_enabled:
        params['input_perm'] = [3, 0, 1, 2]
        params['output_perm'] = [1, 2, 3, 0]
        batch_axis = 3

    if FLAGS.eval_total_size > 0:
        eval_size = FLAGS.eval_total_size
    else:
        eval_size = _NUM_EVAL_IMAGES
    eval_steps = eval_size // FLAGS.eval_batch_size

    iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

    eval_batch_size = (None
                       if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

    per_host_input_for_training = (FLAGS.num_shards <= 8
                                   if FLAGS.mode == 'train' else True)

    run_config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        save_summary_steps=FLAGS.save_summary_steps,
        session_config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=iterations,
            num_shards=FLAGS.num_shards,
            per_host_input_for_training=per_host_input_for_training))

    inception_classifier = tpu_estimator.TPUEstimator(
        model_fn=inception_model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        params=params,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=eval_batch_size,
        batch_axis=(batch_axis, 0))

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    use_bfloat16 = FLAGS.precision == 'bfloat16'
    imagenet_train = InputPipeline(is_training=True,
                                   data_dir=FLAGS.data_dir,
                                   use_bfloat16=use_bfloat16)
    imagenet_eval = InputPipeline(is_training=False,
                                  data_dir=FLAGS.data_dir,
                                  use_bfloat16=use_bfloat16)

    if FLAGS.moving_average:
        eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
    else:
        eval_hooks = []

    if FLAGS.mode == 'eval':
        # Run evaluation when there is a new checkpoint
        for checkpoint in evaluation.checkpoints_iterator(FLAGS.model_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # Includes compilation time
                eval_results = inception_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(checkpoint).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break
            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    checkpoint)

    elif FLAGS.mode == 'train_and_eval':
        for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
            tf.logging.info('Starting training cycle %d.' % cycle)
            inception_classifier.train(input_fn=imagenet_train.input_fn,
                                       steps=FLAGS.train_steps_per_eval)

            tf.logging.info('Starting evaluation cycle %d .' % cycle)
            eval_results = inception_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s' % eval_results)

    else:
        tf.logging.info('Starting training ...')
        inception_classifier.train(input_fn=imagenet_train.input_fn,
                                   max_steps=FLAGS.train_steps)
Example #27
def main(unused_argv):
  tpu_grpc_url = None
  tpu_cluster_resolver = None
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use
    #if not FLAGS.master and not FLAGS.tpu_name:
    #  raise RuntimeError('You must specify either --master or --tpu_name.')

    if FLAGS.master:
      if FLAGS.tpu_name:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name,
              zone=FLAGS.tpu_zone,
              project=FLAGS.gcp_project))
  else:
    # URL is unused if running locally without TPU
    tpu_grpc_url = None
 
  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      cluster=tpu_cluster_resolver,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores))
    
  # Prepare training and testing data
  dbpedia = tf.contrib.learn.datasets.load_dataset(
      'dbpedia', size='large', test_with_fake_data=FLAGS.test_with_fake_data)
  
  print("Shuffling data set...")
  x_train = dbpedia.train.data[:, 1]
  y_train = dbpedia.train.target
  s = np.arange(len(y_train))
  np.random.shuffle(s)
  x_train = x_train[s]
  y_train = y_train[s]
  print("Done!")  
  
  x_train = pandas.Series(x_train)
  y_train = pandas.Series(y_train)
  x_test = pandas.Series(dbpedia.test.data[:, 1])
  y_test = pandas.Series(dbpedia.test.target)

  print('Train data size:', x_train.shape)
  print('Test data size:', x_test.shape)

  # Process vocabulary
  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
      MAX_DOCUMENT_LENGTH)
  x_train = np.array(list(char_processor.fit_transform(x_train)))
  x_test = np.array(list(char_processor.transform(x_test)))

  # Build model
  #classifier = tf.estimator.Estimator(model_fn=char_rnn_model)
  classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=char_rnn_model,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)    


  def TPU_train_input_fn(params):
      return tf.estimator.inputs.numpy_input_fn(
          x={CHARS_FEATURE: x_train},
          y=y_train,
          batch_size=params['batch_size'],
          num_epochs=None,
          shuffle=True)()

  def TPU_test_input_fn(params):
      return tf.estimator.inputs.numpy_input_fn(
          x={CHARS_FEATURE: x_test},
          y=y_test,
          batch_size=params['batch_size'],
          num_epochs=1,
          shuffle=False)()

  # Train.
  current_step = 0
  while current_step < FLAGS.train_steps:
    # Train for up to steps_per_eval number of steps.
    # At the end of training, a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                          FLAGS.train_steps)

    classifier.train(
        input_fn=TPU_train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint
    
    # Eval.
    tf.logging.info('Starting to evaluate.')
    eval_results = classifier.evaluate(
        input_fn=TPU_test_input_fn)
    tf.logging.info('Test eval results: %s' % eval_results)

    eval_results = classifier.evaluate(
        input_fn=TPU_train_input_fn)
    tf.logging.info('Train eval results: %s' % eval_results)
    
  scores = classifier.evaluate(input_fn=TPU_test_input_fn)
  print('Accuracy: {0:f}'.format(scores['accuracy']))
Example #28
def main(argv):
    del argv  # Unused.

    tf.enable_resource_variables()
    tf.set_random_seed(FLAGS.seed)
    set_lr_schedule()
    set_custom_sparsity_map()
    folder_stub = os.path.join(FLAGS.training_method, str(FLAGS.end_sparsity),
                               str(FLAGS.maskupdate_begin_step),
                               str(FLAGS.maskupdate_end_step),
                               str(FLAGS.maskupdate_frequency),
                               str(FLAGS.drop_fraction),
                               str(FLAGS.label_smoothing),
                               str(FLAGS.weight_decay))

    output_dir = FLAGS.output_dir
    if FLAGS.use_folder_stub:
        output_dir = os.path.join(output_dir, folder_stub)

    export_dir = os.path.join(output_dir, 'export_dir')

    # Pass the output directory and training method to the params dictionary.
    params = {}
    params['output_dir'] = output_dir
    params['training_method'] = FLAGS.training_method
    params['use_tpu'] = FLAGS.use_tpu

    dataset_func = functools.partial(
        imagenet_input.ImageNetInput,
        data_dir=FLAGS.data_directory,
        transpose_input=False,
        num_parallel_calls=FLAGS.num_parallel_calls,
        use_bfloat16=False)
    imagenet_train, imagenet_eval = [
        dataset_func(is_training=is_training) for is_training in [True, False]
    ]

    run_config = tpu_config.RunConfig(
        master=FLAGS.master,
        model_dir=output_dir,
        save_checkpoints_steps=FLAGS.steps_per_checkpoint,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores,
            tpu_job_name=FLAGS.tpu_job_name))

    classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    cpu_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        export_to_tpu=False,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0:
        raise ValueError(
            'eval_batch_size (%d) must evenly divide num_eval_images(%d)!' %
            (FLAGS.eval_batch_size, FLAGS.num_eval_images))

    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
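    # Given the divisibility check above, eval_steps covers the evaluation set
    # exactly once.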
    if FLAGS.mode == 'eval_once':
        ckpt_path = os.path.join(output_dir, FLAGS.eval_once_ckpt_prefix)
        dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
        classifier.evaluate(input_fn=dataset.input_fn,
                            steps=eval_steps,
                            checkpoint_path=ckpt_path,
                            name='{0}'.format(FLAGS.eval_once_ckpt_prefix))
    elif FLAGS.mode == 'eval':
        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(output_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
                classifier.evaluate(input_fn=dataset.input_fn,
                                    steps=eval_steps,
                                    checkpoint_path=ckpt,
                                    name='eval')
                # Terminate eval job when final checkpoint is reached
                global_step = int(os.path.basename(ckpt).split('-')[1])
                if global_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        global_step)
                    break

            except tf.errors.NotFoundError:
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint.' % ckpt)

    else:
        global_step = estimator._load_global_step_from_checkpoint_dir(
            output_dir)
        # Session run hooks to export model for prediction
        export_hook = ExportModelHook(cpu_classifier, export_dir)
        hooks = [export_hook]

        if FLAGS.mode == 'train':
            tf.logging.info('start training...')
            classifier.train(input_fn=imagenet_train.input_fn,
                             hooks=hooks,
                             max_steps=FLAGS.train_steps)
        else:
            assert FLAGS.mode == 'train_and_eval'
            tf.logging.info('start training and eval...')
            while global_step < FLAGS.train_steps:
                next_checkpoint = min(global_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                classifier.train(input_fn=imagenet_train.input_fn,
                                 max_steps=next_checkpoint)
                global_step = next_checkpoint
                tf.logging.info('Completed training up to step: %d' % global_step)
                classifier.evaluate(input_fn=imagenet_eval.input_fn,
                                    steps=eval_steps)
Example #29
File: resnet_main.py Project: vishh/tpu
def main(unused_argv):
    tpu_grpc_url = None
    tpu_cluster_resolver = None
    if FLAGS.use_tpu:
        # Determine the gRPC URL of the TPU device to use
        if not FLAGS.master and not FLAGS.tpu_name:
            raise RuntimeError(
                'You must specify either --master or --tpu_name.')

        if FLAGS.master:
            if FLAGS.tpu_name:
                tf.logging.warn(
                    'Both --master and --tpu_name are set. Ignoring'
                    ' --tpu_name and using --master.')
            tpu_grpc_url = FLAGS.master
        else:
            tpu_cluster_resolver = (
                tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name,
                    zone=FLAGS.tpu_zone,
                    project=FLAGS.gcp_project))
    else:
        # URL is unused if running locally without TPU
        tpu_grpc_url = None

    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        cluster=tpu_cluster_resolver,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_cores))

    resnet_classifier = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = imagenet_input.ImageNetInput(is_training=True,
                                                  data_dir=FLAGS.data_dir)
    imagenet_eval = imagenet_input.ImageNetInput(is_training=False,
                                                 data_dir=FLAGS.data_dir)

    if FLAGS.mode == 'eval':
        eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                                (eval_results, elapsed_time))

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                        ' step %d.' % (FLAGS.train_steps, FLAGS.train_steps /
                                       batches_per_epoch, current_step))

        start_timestamp = time.time()  # This time will include compilation time
        if FLAGS.mode == 'train':
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=FLAGS.train_steps)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be consistently excluded modulo the batch size.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.' %
            (FLAGS.train_steps, elapsed_time))

        if FLAGS.export_dir is not None:
            # The guide to serve a exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            resnet_classifier.export_savedmodel(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
Example #30
def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu,
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
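    # Reset the session target so that any state left on the TPU worker from a
    # previous run is cleared before training starts.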
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError(
        'You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
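  # XLA JIT compilation only needs to be requested for CPU/GPU runs; TPU graphs
  # are compiled through XLA regardless.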
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
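    # The epoch budget (num_epochs * num_examples_per_epoch images) is converted
    # into a global-step budget at the given train_batch_size.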
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
          use_bfloat16=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # eval only runs on CPU or GPU host with batch_size = 1

    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )

    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)
  else:
    tf.logging.info('Unknown mode: %s' % FLAGS.mode)
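
Note: every example above follows the same TPUEstimator skeleton: build a tpu_config.RunConfig (optionally from a TPUClusterResolver), wrap a model_fn in a TPUEstimator, then alternate train() and evaluate() in steps_per_eval-sized chunks. The snippet below is a minimal sketch of that pattern that runs on CPU (use_tpu=False); my_model_fn, my_input_fn, and all of the constants in it are hypothetical placeholders rather than code from any of the projects quoted above. On a real TPU you would additionally pass a cluster resolver (or master URL) into the RunConfig and set use_tpu=True, as the examples do.

import tensorflow as tf
from tensorflow.contrib import tpu as contrib_tpu


def my_input_fn(params):
  # TPUEstimator passes the per-shard batch size in params['batch_size'].
  batch_size = params['batch_size']
  features = tf.random_uniform([1024, 8])
  labels = tf.random_uniform([1024], maxval=2, dtype=tf.int32)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  return dataset.repeat().batch(batch_size, drop_remainder=True)


def my_model_fn(features, labels, mode, params):
  logits = tf.layers.dense(features, 2)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  train_op = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    if params['use_tpu']:
      # Average gradients across TPU shards.
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)


tf.logging.set_verbosity(tf.logging.INFO)

run_config = contrib_tpu.RunConfig(
    model_dir='/tmp/tpu_estimator_sketch',
    tpu_config=contrib_tpu.TPUConfig(iterations_per_loop=100))

classifier = contrib_tpu.TPUEstimator(
    model_fn=my_model_fn,
    config=run_config,
    use_tpu=False,
    train_batch_size=64,
    eval_batch_size=64,
    params={'use_tpu': False})

# Alternate training and evaluation, as in the train_and_eval loops above.
current_step, train_steps, steps_per_eval = 0, 300, 100
while current_step < train_steps:
  next_checkpoint = min(current_step + steps_per_eval, train_steps)
  classifier.train(input_fn=my_input_fn, max_steps=next_checkpoint)
  current_step = next_checkpoint
  eval_results = classifier.evaluate(input_fn=my_input_fn, steps=10)
  tf.logging.info('Eval results: %s' % eval_results)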