def main(unused_argv):
  """Trains or continuously evaluates an object-detection model on TPU."""
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')

  # Use the public TPUClusterResolver entry point, consistent with the other
  # TPU scripts in this codebase, instead of the deep private module path
  # tf.contrib.cluster_resolver.python.training.TPUClusterResolver whose
  # `tpu_names` keyword is deprecated.
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      eval_steps=FLAGS.num_eval_steps,
      use_tpu_estimator=True,
      use_tpu=FLAGS.use_tpu,
      num_shards=FLAGS.num_shards,
      batch_size=FLAGS.train_batch_size)
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fn = train_and_eval_dict['eval_input_fn']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  train_steps = train_and_eval_dict['train_steps']
  eval_steps = train_and_eval_dict['eval_steps']

  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      input_fn = eval_input_fn
    model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, eval_steps,
                              train_steps, name)
def main(argv):
  """Builds a RetinaNet TPUEstimator and exports it as a SavedModel."""
  del argv  # Unused.

  # Assemble the parameter set: defaults, restrictions, then CLI overrides,
  # and freeze it before use.
  serving_params = params_dict.ParamsDict(
      retinanet_config.RETINANET_CFG, retinanet_config.RETINANET_RESTRICTIONS)
  serving_params = params_dict.override_params_dict(
      serving_params, FLAGS.params_override, is_strict=True)
  serving_params.validate()
  serving_params.lock()

  estimator_params = dict(
      serving_params.as_dict(),
      use_tpu=FLAGS.use_tpu,
      mode=tf.estimator.ModeKeys.PREDICT,
      transpose_input=False)

  print(' - Setting up TPUEstimator...')
  serving_model_fn = serving.serving_model_fn_builder(
      FLAGS.use_tpu, FLAGS.output_image_info,
      FLAGS.output_normalized_coordinates,
      FLAGS.cast_num_detections_to_float)
  run_cfg = tpu_config.RunConfig(
      tpu_config=tpu_config.TPUConfig(iterations_per_loop=1),
      master='local',
      evaluation_master='local')
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=serving_model_fn,
      model_dir=None,
      config=run_cfg,
      params=estimator_params,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      export_to_tpu=FLAGS.use_tpu,
      export_to_cpu=True)

  print(' - Exporting the model...')
  image_size = [int(dim) for dim in FLAGS.input_image_size.split(',')]
  receiver_fn = functools.partial(
      serving.serving_input_fn,
      batch_size=FLAGS.batch_size,
      desired_image_size=image_size,
      stride=(2 ** serving_params.anchor.max_level),
      input_type=FLAGS.input_type,
      input_name=FLAGS.input_name)
  export_path = estimator.export_saved_model(
      export_dir_base=FLAGS.export_dir,
      serving_input_receiver_fn=receiver_fn,
      checkpoint_path=FLAGS.checkpoint_path)
  print(' - Done! path: %s' % export_path)
def main(_):
  """Exports a Mask R-CNN SavedModel, optionally writing warmup requests."""
  # Default config plus command-line overrides, locked into inference mode.
  config = mask_rcnn_params.default_config()
  config = params_io.override_hparams(config, FLAGS.config)
  config.is_training_bn = False
  config.train_batch_size = FLAGS.batch_size
  config.eval_batch_size = FLAGS.batch_size

  predict_params = dict(
      config.values(),
      use_tpu=FLAGS.use_tpu,
      mode=tf.estimator.ModeKeys.PREDICT,
      transpose_input=False)

  print(' - Setting up TPUEstimator...')
  run_cfg = tpu_config.RunConfig(
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop),
      master='local',
      evaluation_master='local')
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=mask_rcnn_model.mask_rcnn_model_fn,
      model_dir=FLAGS.model_dir,
      config=run_cfg,
      params=predict_params,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      export_to_tpu=FLAGS.use_tpu,
      export_to_cpu=True,
      experimental_exported_model_uses_all_cores=(
          FLAGS.inference_with_all_cores))

  print(' - Exporting the model...')
  input_type = FLAGS.input_type
  receiver_fn = functools.partial(
      serving_inputs.serving_input_fn,
      batch_size=FLAGS.batch_size,
      desired_image_size=config.image_size,
      padding_stride=(2 ** config.max_level),
      input_type=input_type)
  export_path = estimator.export_saved_model(
      export_dir_base=FLAGS.export_dir,
      serving_input_receiver_fn=receiver_fn,
      checkpoint_path=FLAGS.checkpoint_path)

  # Warmup requests only apply to the image_bytes input signature.
  if FLAGS.add_warmup_requests and input_type == 'image_bytes':
    inference_warmup.write_warmup_requests(
        export_path,
        FLAGS.model_name,
        config.image_size,
        batch_sizes=[FLAGS.batch_size],
        image_format='JPEG',
        input_signature=serving_inputs.INPUT_SIGNATURE)
def main(unused_argv):
  """Runs cycles of Inception training, with optional evaluation per cycle."""
  del unused_argv  # Unused

  if FLAGS.input_layout not in ['NCHW', 'NHWC']:
    raise RuntimeError('--input_layout must be one of [NCHW, NHWC]')

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations, num_shards=FLAGS.num_shards))

  per_shard_batch = FLAGS.train_batch_size // FLAGS.num_shards
  inception_classifier = tpu_estimator.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      batch_axis=(get_batch_axis(per_shard_batch), 0))

  for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
    tf.logging.info('Starting training cycle %d.' % cycle)
    inception_classifier.train(
        input_fn=ImageNetInput(True), steps=FLAGS.train_steps_per_eval)

    if FLAGS.eval_enabled:
      eval_steps = (imagenet.get_split_size('validation') //
                    FLAGS.eval_batch_size)
      tf.logging.info('Starting evaluation cycle %d .' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=ImageNetInput(False), steps=eval_steps)
      tf.logging.info('Evaluation results: %s' % eval_results)
def test_no_session_config_overwrite_with_cluster_spec(self):
  """A user-supplied session_config must survive TF_CONFIG cluster setup."""
  tf_config = {
      'task': {
          'type': run_config_lib.TaskType.CHIEF,
          'index': 0
      },
      'cluster': {
          run_config_lib.TaskType.CHIEF: ['host3:3'],
          run_config_lib.TaskType.WORKER: ['host3:4']
      },
  }
  with _set_tf_config_env_variable(tf_config):
    user_session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    config = tpu_config_lib.RunConfig(session_config=user_session_config)
    self.assertEqual(user_session_config, config.session_config)
def main(unused_argv):
  """Trains the events RNN with a TPUEstimator on real or fake data."""
  tf.logging.set_verbosity(FLAGS.log)

  hparams = HParams(
      batch_size=64,
      rnn_layer_sizes=[64, 64],
      dropout_keep_prob=0.5,
      skip_first_n_losses=0,
      clip_norm=5,
      initial_learning_rate=0.01,
      decay_steps=1000,
      decay_rate=0.95)

  # Static RNNs need a fixed padding length; dynamic RNNs pad per batch.
  padding = FLAGS.static_padding_length if FLAGS.use_static_rnn else None

  if FLAGS.sequence_example_file:
    record_paths = tf.gfile.Glob(
        os.path.expanduser(FLAGS.sequence_example_file))
    tf.logging.info('Using real data from : %s', record_paths)
    input_fn = input_fn_by_record_files(
        record_paths, _INPUT_SIZE, padding_length=padding)
  else:
    tf.logging.info('Using fake data')
    input_fn = input_fn_by_dataset_with_fake_data(
        _INPUT_SIZE, padding_length=padding)

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=events_rnn_graph.build_model_fn(hparams),
      config=run_config,
      train_batch_size=hparams.batch_size,
      use_tpu=FLAGS.use_tpu)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.num_training_steps)
def main(unused_argv):
  """Trains the model for FLAGS.train_steps steps on TPU or locally."""
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)

  session_config = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=True)
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=session_config,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards))

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def train(working_dir, *tf_records, steps=None):
  """Trains on the given tf_records, on TPU or locally based on FLAGS."""
  if FLAGS.use_tpu:
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    config = tpu_config.RunConfig(
        cluster=resolver,
        model_dir=working_dir,
        # Checkpoint at least every 600 steps, aligned to the TPU loop.
        save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=(
                tpu_config.InputPipelineConfig.PER_HOST_V2)))
    estimator = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.train_batch_size)

    def input_fn(params):
      # TPUEstimator supplies the per-shard batch size via params.
      return preprocessing.get_tpu_input_tensors(params['batch_size'],
                                                 tf_records)

    # TODO: get hooks working again with TPUestimator.
    hooks = []
  else:
    estimator = get_estimator(working_dir)

    def input_fn():
      return preprocessing.get_input_tensors(
          FLAGS.train_batch_size,
          tf_records,
          filter_amount=1.0,
          shuffle_buffer_size=FLAGS.shuffle_buffer_size)

    hooks = [
        UpdateRatioSessionHook(working_dir),
        EchoStepCounterHook(output_dir=working_dir),
    ]

  if steps is None:
    steps = EXAMPLES_PER_GENERATION // FLAGS.train_batch_size
  print("Training, steps = {}".format(steps))
  estimator.train(input_fn, steps=int(steps), hooks=hooks)
def main(unused_argv):
  """Validates flags, then trains and optionally evaluates the model."""
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)

  # Flag sanity checks; tf.logging.fatal aborts the process.
  if not FLAGS.save_checkpoints_secs:
    if not FLAGS.eval_steps:
      tf.logging.info(
          "If checkpoint is expected, please set --save_checkpoints_secs."
      )
    else:
      tf.logging.fatal(
          "Flag --save_checkpoints_secs must be set for evaluation. Aborting."
      )
  if not FLAGS.train_file:
    tf.logging.fatal("Flag --train_file must be set for training. Aborting.")
  if FLAGS.eval_steps and not FLAGS.eval_file:
    tf.logging.fatal("Flag --eval_file must be set for evaluation. Aborting.")

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      config=run_config)

  estimator.train(
      input_fn=get_input_fn(FLAGS.train_file), max_steps=FLAGS.train_steps)
  if FLAGS.eval_steps:
    estimator.evaluate(
        input_fn=get_input_fn(FLAGS.eval_file), steps=FLAGS.eval_steps)
def _build_estimator(self, is_training):
  """Creates the estimator for this run.

  Args:
    is_training: Boolean, whether or not we're in training mode.

  Returns:
    A tf.estimator.Estimator (CPU/GPU) or tpu_estimator.TPUEstimator (TPU).
  """
  cfg = self._config
  ckpt_steps = cfg.logging.checkpoint.save_checkpoints_steps
  ckpt_keep = self._config.logging.checkpoint.num_to_keep

  if not (is_training and cfg.use_tpu):
    # Non-TPU path: plain Estimator with the same checkpointing policy.
    run_config = tf.estimator.RunConfig().replace(
        model_dir=self._logdir,
        save_checkpoints_steps=ckpt_steps,
        keep_checkpoint_max=ckpt_keep,
        tf_random_seed=FLAGS.tf_random_seed)
    return tf.estimator.Estimator(
        model_fn=self._get_model_fn(), config=run_config)

  # TPU training path.
  run_config = tpu_config.RunConfig(
      save_checkpoints_secs=None,
      save_checkpoints_steps=ckpt_steps,
      keep_checkpoint_max=ckpt_keep,
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=self._logdir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=cfg.tpu.iterations,
          num_shards=cfg.tpu.num_shards,
          # Per-host input only when all shards fit on one host.
          per_host_input_for_training=cfg.tpu.num_shards <= 8),
      tf_random_seed=FLAGS.tf_random_seed)
  batch = cfg.data.batch_size
  return tpu_estimator.TPUEstimator(
      model_fn=self._get_model_fn(),
      config=run_config,
      use_tpu=True,
      train_batch_size=batch,
      eval_batch_size=batch)
def test_create_tpu_estimator_and_inputs(self):
  """Tests that number of train/eval defaults to config values."""
  run_config = tpu_config.RunConfig()
  hparams = model_hparams.create_hparams(
      hparams_overrides='load_pretrained=false')
  pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)

  bundle = model_lib.create_estimator_and_inputs(
      run_config,
      hparams,
      pipeline_config_path,
      train_steps=20,
      use_tpu_estimator=True)

  self.assertIsInstance(bundle['estimator'], tpu_estimator.TPUEstimator)
  self.assertEqual(20, bundle['train_steps'])
def main(argv):
  """Builds the TPU estimator and runs training."""
  del argv  # Unused.

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      # Checkpoint hourly.
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations, num_shards=FLAGS.num_shards),
  )
  trainer = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  trainer.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def build_run_config():
  """Return RunConfig for TPU estimator."""
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # In eval mode one TPU loop covers exactly one pass over the eval set.
  eval_steps = model_lib.NUM_EVAL_IMAGES // FLAGS.eval_batch_size
  if FLAGS.mode == 'eval':
    loop_iterations = eval_steps
  else:
    loop_iterations = FLAGS.iterations_per_loop
  checkpoint_steps = FLAGS.save_checkpoints_steps or loop_iterations

  return tpu_config.RunConfig(
      cluster=resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=checkpoint_steps,
      keep_checkpoint_max=None,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=loop_iterations,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=(
              tpu_config.InputPipelineConfig.PER_HOST_V2)))
def run_toy_model_tpu():
  """Run a toy model on TPU."""
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  iterations_per_loop = FLAGS.iterations
  mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
  config = tpu_config.RunConfig(
      cluster=resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=None,  # Disable the default saver
      save_checkpoints_secs=None,  # Disable the default saver
      log_step_count_steps=iterations_per_loop,
      save_summary_steps=iterations_per_loop,
      tpu_config=tpu_config.TPUConfig(
          num_shards=mesh_shape.size,
          iterations_per_loop=iterations_per_loop,
          num_cores_per_replica=1,
          per_host_input_for_training=(
              tpu_config.InputPipelineConfig.BROADCAST)))
  classifier = tpu_estimator.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size)

  current_step = estimator_lib._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)  # pylint: disable=protected-access
  logging.info('Current step %d', current_step)

  if FLAGS.steps_per_checkpoint == 0:
    # One uninterrupted run to the target step count, no evaluation.
    classifier.train(input_fn=ToyModelInput(), max_steps=FLAGS.train_steps)
    return

  # Otherwise alternate train / eval at each checkpoint boundary.
  while current_step < FLAGS.train_steps:
    next_checkpoint = min(current_step + FLAGS.steps_per_checkpoint,
                          FLAGS.train_steps)
    classifier.train(input_fn=ToyModelInput(), max_steps=next_checkpoint)
    current_step = next_checkpoint
    logging.info('Starting to evaluate.')
    eval_results = classifier.evaluate(
        input_fn=ToyModelInput(),
        steps=156)  # since we have 10000 examples and batch_size = 64 per host
    logging.info('Eval results: %s', eval_results)
def main(argv):
  """Trains CIFAR in ten segments, evaluating the test set after each."""
  del argv

  training_examples = (FLAGS.train_epochs * 40000)
  eval_examples = 10000
  # Batches covering 10% of the training examples.
  segment_steps = (training_examples // 10) // FLAGS.train_batch_size

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.steps_per_checkpoint,
      log_step_count_steps=segment_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=segment_steps,
          num_shards=FLAGS.num_shards,
      ),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      params=dict(CIFAR_SMALL_PARAMS, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after each 10% training segment finishes.
  for cycle in range(10):
    tf.logging.info("Starting %d train steps" % segment_steps)
    estimator.train(
        input_fn=InputReader(FLAGS.train_file, is_training=True),
        steps=segment_steps)
    tf.logging.info("Starting evaluation cycle %d ." % cycle)
    print(
        estimator.evaluate(
            input_fn=InputReader(FLAGS.train_file, is_training=False),
            steps=eval_examples // FLAGS.eval_batch_size,
        ))
def _test_warm_start(self, warm_start_from=None):
  """Tests whether WarmStartSettings work as intended."""

  def generator_with_new_variable(noise_dict, mode):
    # Identical generator, plus one extra trainable variable that the
    # warm-start logic has to handle.
    variable_scope.get_variable(
        name=self.new_variable_name,
        initializer=self.new_variable_value,
        trainable=True)
    return generator_fn(noise_dict, mode)

  def train_input_fn(params):
    data = np.zeros([params['batch_size'], 4], dtype=np.float32)
    return data, data

  # First train one step with the plain generator to produce a checkpoint.
  base_estimator = estimator.TPUGANEstimator(
      generator_fn=generator_fn,
      discriminator_fn=discriminator_fn,
      generator_loss_fn=losses.wasserstein_generator_loss,
      discriminator_loss_fn=losses.wasserstein_discriminator_loss,
      generator_optimizer=training.GradientDescentOptimizer(1.0),
      discriminator_optimizer=training.GradientDescentOptimizer(1.0),
      train_batch_size=4,
      use_tpu=FLAGS.use_tpu,
      config=self._config)
  base_estimator.train(train_input_fn, steps=1)

  # Then warm-start a second estimator whose generator has the new variable.
  est_warm = estimator.TPUGANEstimator(
      generator_fn=generator_with_new_variable,
      discriminator_fn=discriminator_fn,
      generator_loss_fn=losses.wasserstein_generator_loss,
      discriminator_loss_fn=losses.wasserstein_discriminator_loss,
      generator_optimizer=training.GradientDescentOptimizer(1.0),
      discriminator_optimizer=training.GradientDescentOptimizer(1.0),
      config=tpu_config.RunConfig(
          model_dir=None if warm_start_from else self._model_dir),
      train_batch_size=4,
      use_tpu=FLAGS.use_tpu,
      warm_start_from=warm_start_from)
  est_warm.train(train_input_fn, steps=1)
  return est_warm
def testTrainingPipeline(self, training_method):
  """Smoke-tests one training step of the pruning ResNet pipeline on CPU."""
  output_directory = '/tmp/'
  g = tf.Graph()
  with g.as_default():
    dataset = self._retrieve_data(is_training=False, data_dir=False)

    # Force a minimal single-step CPU configuration via flags.
    FLAGS.transpose_input = False
    FLAGS.use_tpu = False
    FLAGS.mode = 'train'
    FLAGS.mask_init_method = 'random'
    FLAGS.precision = 'float32'
    FLAGS.train_steps = 1
    FLAGS.train_batch_size = 1
    FLAGS.eval_batch_size = 1
    FLAGS.steps_per_eval = 1
    FLAGS.model_architecture = 'resnet'

    params = {
        'output_dir': output_directory,
        'training_method': training_method,
        'use_tpu': False,
    }
    set_lr_schedule()

    run_config = tpu_config.RunConfig(
        master=None,
        model_dir=None,
        save_checkpoints_steps=1,
        tpu_config=tpu_config.TPUConfig(iterations_per_loop=1, num_shards=1))
    classifier = tpu_estimator.TPUEstimator(
        use_tpu=False,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=1,
        eval_batch_size=1)
    classifier.train(input_fn=dataset.input_fn, max_steps=1)
def create_estimator(master,
                     model_dir,
                     use_tpu,
                     iterations_per_loop,
                     num_shards,
                     model_params,
                     include_features_in_predictions=True,
                     decode_keys=(),
                     train_init_checkpoint=None,
                     train_warmup_steps=10000,
                     save_checkpoints_steps=1000,
                     keep_checkpoint_max=1):
  """Returns a TensorFlow TPUEstimator for training/eval/predict."""
  # Session config: allow TF to fall back to CPU placement; keep GPU
  # memory growth disabled.
  session_config = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  session_config.gpu_options.allow_growth = False

  # This is the runtime config for tensorflow estimators.
  run_config = tpu_config.RunConfig(
      master=master,
      model_dir=model_dir,
      session_config=session_config,
      tpu_config=tpu_config.TPUConfig(iterations_per_loop),
      save_checkpoints_steps=save_checkpoints_steps,
      keep_checkpoint_max=keep_checkpoint_max)

  model_fn = _estimator_model_fn(use_tpu, model_params, model_dir,
                                 include_features_in_predictions, decode_keys,
                                 train_init_checkpoint, train_warmup_steps)
  # All batch sizes scale with the shard count (num_shards is 1 by default).
  global_batch = model_params.batch_size * num_shards
  return tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=use_tpu,
      train_batch_size=global_batch,
      eval_batch_size=global_batch,
      predict_batch_size=global_batch,
      config=run_config)
def main(unused_argv):
  """Alternates ResNet training and ImageNet evaluation on TPU."""
  config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100, num_shards=8))
  resnet_classifier = tpu_estimator.TPUEstimator(
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  for _ in range(FLAGS.train_steps // FLAGS.steps_per_eval):
    tf.logging.info('Starting a training cycle.')
    resnet_classifier.train(
        input_fn=ImageNetInput(True), steps=FLAGS.steps_per_eval)

    # 50000 = size of the ImageNet validation split.
    eval_steps = 50000 // FLAGS.eval_batch_size
    tf.logging.info('Starting to evaluate.')
    eval_results = resnet_classifier.evaluate(
        input_fn=ImageNetInput(False), steps=eval_steps)
    tf.logging.info('Eval results: %s' % eval_results)
def main(unused_argv):
  """Runs one training session and prints the wall-clock time it took."""
  assert len(unused_argv) == 1, ("Unrecognized command line arguments: %s" %
                                 unused_argv[1:])
  start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=64,
      config=run_config)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)

  elapsed = time.time() - start
  print("Total time: " + str(elapsed))
def main(argv):
  """Trains CIFAR in ten segments, evaluating the test set after each."""
  del argv

  training_examples = FLAGS.train_epochs * 40000
  eval_examples = 10000
  # One TPU loop covers 10% of the training examples, measured in batches.
  steps_per_segment = training_examples // 10 // FLAGS.batch_size

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=steps_per_segment,
          num_shards=FLAGS.num_shards,
      ),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(CIFAR_SMALL_PARAMS, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after 10% of training examples are finished.
  for _ in range(10):
    # BUG FIX: Estimator `steps` are counted in batches, not examples. The
    # previous code passed raw example counts, which trained/evaluated
    # batch_size times longer than intended (cf. iterations_per_loop above,
    # which already divides by the batch size).
    estimator.train(
        input_fn=InputReader(FLAGS.train_file, is_training=True),
        steps=steps_per_segment)
    print(
        estimator.evaluate(
            input_fn=InputReader(FLAGS.train_file, is_training=False),
            steps=eval_examples // FLAGS.batch_size,
        ))
def get_tpu_estimator(working_dir):
  """Builds a TPUEstimator whose checkpoints live under working_dir."""
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=None, project=None)
  master = resolver.get_master()

  run_config = tpu_config.RunConfig(
      master=master,
      evaluation_master=master,
      model_dir=working_dir,
      # Checkpoint at least every 1000 steps, aligned to the TPU loop.
      save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop),
      save_summary_steps=FLAGS.summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=(
              tpu_config.InputPipelineConfig.PER_HOST_V2)))

  # Global batch = per-core batch times the number of cores.
  global_batch_size = FLAGS.train_batch_size * FLAGS.num_tpu_cores
  return tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=global_batch_size,
      eval_batch_size=global_batch_size)
def main(unused_argv):
  """Runs DenseNet in train / train_and_eval / continuous-eval mode on TPU."""
  # Resolve the TPU master URL (if any).
  if FLAGS.use_tpu:
    if FLAGS.master is None and FLAGS.tpu_name is None:
      raise RuntimeError("You must specify either --master or --tpu_name.")
    if FLAGS.master is not None:
      if FLAGS.tpu_name is not None:
        tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                        "--tpu_name and using --master.")
      tpu_grpc_url = FLAGS.master
    else:
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name,
              zone=FLAGS.tpu_zone,
              project=FLAGS.gcp_project))
      tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    # URL is unused if running locally without TPU.
    tpu_grpc_url = None

  batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
  steps_per_checkpoint = FLAGS.steps_per_checkpoint
  iterations_per_loop = FLAGS.iterations_per_loop
  eval_steps = _NUM_EVAL_IMAGES // FLAGS.eval_batch_size
  # Never run a TPU loop past a checkpoint boundary.
  if iterations_per_loop is None or steps_per_checkpoint < iterations_per_loop:
    iterations_per_loop = steps_per_checkpoint
  if FLAGS.mode == "eval":
    iterations_per_loop = eval_steps
  params = {
      "batches_per_epoch": batches_per_epoch,
  }

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=steps_per_checkpoint,
      log_step_count_steps=iterations_per_loop,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_shards))

  densenet_estimator = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      params=params)

  if FLAGS.mode == "train":
    tf.logging.info(
        "Training for %d steps (%.2f epochs in total)." %
        (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch))
    densenet_estimator.train(
        input_fn=ImageNetInput(True), max_steps=FLAGS.train_steps)
  elif FLAGS.mode == "train_and_eval":
    current_step = 0
    # BUG FIX: the two implicitly-concatenated string pieces were previously
    # split by a stray raw newline character; rejoined here.
    tf.logging.info(
        "Training for %d steps (%.2f epochs in total). Current "
        "step %d" % (FLAGS.train_steps,
                     FLAGS.train_steps / batches_per_epoch,
                     current_step))
    while current_step < FLAGS.train_steps:
      next_checkpoint = min(current_step + steps_per_checkpoint,
                            FLAGS.train_steps)
      num_steps = next_checkpoint - current_step
      current_step = next_checkpoint
      densenet_estimator.train(input_fn=ImageNetInput(True), steps=num_steps)

      tf.logging.info("Starting to evaluate.")
      eval_results = densenet_estimator.evaluate(
          input_fn=ImageNetInput(False),
          steps=_NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
      tf.logging.info("Eval results: %s" % eval_results)
  else:

    def terminate_eval():
      # timeout_fn for checkpoints_iterator: returning True stops waiting.
      tf.logging.info("Terminating eval after %d seconds of no checkpoints" %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint.
    # If the evaluation worker is delayed in processing a new checkpoint,
    # the checkpoint file may be deleted by the trainer before it can be
    # evaluated. Ignore the error in this case.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info("Starting to evaluate.")
      try:
        eval_results = densenet_estimator.evaluate(
            input_fn=ImageNetInput(False),
            steps=eval_steps,
            checkpoint_path=ckpt)
        tf.logging.info("Eval results: %s" % eval_results)
      except tf.errors.NotFoundError:
        # BUG FIX: the %s placeholder previously had no argument; pass the
        # checkpoint path so the message renders correctly.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint", ckpt)
def main(argv):
  """Trains a TPU GAN, periodically evaluating and rendering sample images."""
  del argv

  # Resolve the TPU master URL (if any).
  if FLAGS.use_tpu:
    if FLAGS.master is None and FLAGS.tpu_name is None:
      raise RuntimeError('You must specify either --master or --tpu_name.')
    if FLAGS.master is not None:
      if FLAGS.tpu_name is not None:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                        '--tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
      tpu_grpc_url = resolver.get_master()
  else:
    tpu_grpc_url = None

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          num_shards=FLAGS.num_shards,
          iterations_per_loop=FLAGS.iterations_per_loop))

  # Set module-level global variable so that model_fn and input_fn can be
  # identical for each different kind of dataset and model.
  global dataset, model
  if FLAGS.dataset == 'mnist':
    dataset, model = mnist_input, mnist_model
  elif FLAGS.dataset == 'cifar':
    dataset, model = cifar_input, cifar_model
  else:
    raise ValueError('Invalid dataset: %s' % FLAGS.dataset)

  # TPU-based estimator used for TRAIN and EVAL.
  est = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size)
  # CPU-based estimator used for PREDICT (generating images).
  cpu_est = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=False,
      config=config,
      predict_batch_size=_NUM_VIZ_IMAGES)

  tf.gfile.MakeDirs(os.path.join(FLAGS.model_dir, 'generated_images'))

  current_step = estimator._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)  # pylint: disable=protected-access
  tf.logging.info('Starting training for %d steps, current step: %d' %
                  (FLAGS.train_steps, current_step))
  while current_step < FLAGS.train_steps:
    next_checkpoint = min(current_step + FLAGS.train_steps_per_eval,
                          FLAGS.train_steps)
    est.train(input_fn=generate_input_fn(True), max_steps=next_checkpoint)
    current_step = next_checkpoint
    tf.logging.info('Finished training step %d' % current_step)

    if FLAGS.eval_loss:
      # Evaluate loss on test set.
      metrics = est.evaluate(
          input_fn=generate_input_fn(False),
          steps=dataset.NUM_EVAL_IMAGES // FLAGS.batch_size)
      tf.logging.info('Finished evaluating')
      tf.logging.info(metrics)

    # Render some generated images as a 10x10 tile.
    generated_iter = cpu_est.predict(input_fn=noise_input_fn)
    images = [p['generated_images'][:, :, :] for p in generated_iter]
    assert len(images) == _NUM_VIZ_IMAGES
    image_rows = [
        np.concatenate(images[i:i + 10], axis=0)
        for i in range(0, _NUM_VIZ_IMAGES, 10)
    ]
    tiled_image = np.concatenate(image_rows, axis=1)
    img = dataset.convert_array_to_image(tiled_image)
    step_string = str(current_step).zfill(5)
    file_obj = tf.gfile.Open(
        os.path.join(FLAGS.model_dir, 'generated_images',
                     'gen_%s.png' % (step_string)), 'w')
    img.save(file_obj, format='png')
    tf.logging.info('Finished generating images')
def main(unused_argv):
  """Trains and/or evaluates the Inception classifier on TPU.

  Depending on --mode, runs continuous evaluation over new checkpoints
  ('eval'), alternating train/eval cycles ('train_and_eval'), or plain
  training (any other value).

  Args:
    unused_argv: Unused command-line arguments.

  Raises:
    RuntimeError: If neither --master nor --tpu_name is specified.
  """
  del unused_argv  # Unused

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  batch_size_per_shard = FLAGS.train_batch_size // FLAGS.num_shards
  params = {
      'input_perm': [0, 1, 2, 3],
      'output_perm': [0, 1, 2, 3],
  }

  # Optionally transpose the batch axis so the TPU infeed is more efficient;
  # the permutation chosen depends on the per-shard batch size.
  batch_axis = 0
  if FLAGS.transpose_enabled:
    if batch_size_per_shard >= 64:
      params['input_perm'] = [3, 0, 1, 2]
      params['output_perm'] = [1, 2, 3, 0]
      batch_axis = 3
    else:
      params['input_perm'] = [2, 0, 1, 3]
      params['output_perm'] = [1, 2, 0, 3]
      batch_axis = 2

  if FLAGS.eval_total_size > 0:
    eval_size = FLAGS.eval_total_size
  else:
    eval_size = _NUM_EVAL_IMAGES
  eval_steps = eval_size // FLAGS.eval_batch_size

  iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

  eval_batch_size = (None if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

  per_host_input_for_training = (
      FLAGS.num_shards <= 8 if FLAGS.mode == 'train' else True)

  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=iterations,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=per_host_input_for_training))

  inception_classifier = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params=params,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=eval_batch_size,
      batch_axis=(batch_axis, 0))

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

  if FLAGS.moving_average:
    eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
  else:
    eval_hooks = []

  if FLAGS.mode == 'eval':

    def terminate_eval():
      tf.logging.info('%d seconds without new checkpoints have elapsed '
                      '... terminating eval' % FLAGS.eval_timeout)
      return True

    def get_next_checkpoint():
      return evaluation.checkpoints_iterator(
          FLAGS.model_dir,
          min_interval_secs=FLAGS.min_eval_interval,
          timeout=FLAGS.eval_timeout,
          timeout_fn=terminate_eval)

    for checkpoint in get_next_checkpoint():
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = inception_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        tf.logging.info('Evaluation results: %s' % eval_results)
      except tf.errors.NotFoundError:
        # Skip checkpoint if it gets deleted prior to evaluation. The format
        # argument was previously missing, which logged a literal '%s'.
        tf.logging.info('Checkpoint %s no longer exists ... skipping' %
                        checkpoint)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
      tf.logging.info('Starting training cycle %d.' % cycle)
      inception_classifier.train(
          input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps_per_eval)

      tf.logging.info('Starting evaluation cycle %d .' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s' % eval_results)

  else:
    tf.logging.info('Starting training ...')
    # Use max_steps so --train_steps is a *total* step budget; with steps=,
    # a restarted job would train train_steps additional steps every run.
    inception_classifier.train(
        input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
def main(unused_argv):
  """Trains and/or evaluates the Inception classifier (bfloat16-capable) on TPU.

  Resolves the TPU cluster, validates --precision, builds the TPUEstimator
  and input pipelines, then dispatches on --mode: continuous evaluation
  ('eval'), alternating cycles ('train_and_eval'), or plain training.

  Args:
    unused_argv: Unused command-line arguments.

  Raises:
    ValueError: If --precision is not 'bfloat16' or 'float32'.
  """
  del unused_argv  # Unused

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # Validate with an explicit raise rather than `assert`: asserts are
  # stripped under `python -O`, which would let invalid values through.
  if FLAGS.precision not in ('bfloat16', 'float32'):
    raise ValueError(
        'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)

  params = {
      'input_perm': [0, 1, 2, 3],
      'output_perm': [0, 1, 2, 3],
  }

  # Optionally transpose the batch axis for more efficient TPU infeed.
  batch_axis = 0
  if FLAGS.transpose_enabled:
    params['input_perm'] = [3, 0, 1, 2]
    params['output_perm'] = [1, 2, 3, 0]
    batch_axis = 3

  if FLAGS.eval_total_size > 0:
    eval_size = FLAGS.eval_total_size
  else:
    eval_size = _NUM_EVAL_IMAGES
  eval_steps = eval_size // FLAGS.eval_batch_size

  iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

  eval_batch_size = (None if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

  per_host_input_for_training = (
      FLAGS.num_shards <= 8 if FLAGS.mode == 'train' else True)

  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=iterations,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=per_host_input_for_training))

  inception_classifier = tpu_estimator.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params=params,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=eval_batch_size,
      batch_axis=(batch_axis, 0))

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  use_bfloat16 = FLAGS.precision == 'bfloat16'
  imagenet_train = InputPipeline(
      is_training=True, data_dir=FLAGS.data_dir, use_bfloat16=use_bfloat16)
  imagenet_eval = InputPipeline(
      is_training=False, data_dir=FLAGS.data_dir, use_bfloat16=use_bfloat16)

  if FLAGS.moving_average:
    eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
  else:
    eval_hooks = []

  if FLAGS.mode == 'eval':
    # Run evaluation when there is a new checkpoint
    for checkpoint in evaluation.checkpoints_iterator(FLAGS.model_dir):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # Includes compilation time
        eval_results = inception_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(checkpoint).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', checkpoint)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
      tf.logging.info('Starting training cycle %d.' % cycle)
      inception_classifier.train(
          input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps_per_eval)

      tf.logging.info('Starting evaluation cycle %d .' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s' % eval_results)

  else:
    tf.logging.info('Starting training ...')
    inception_classifier.train(
        input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
def main(unused_argv):
  """Trains and evaluates a character-level RNN on the DBpedia dataset.

  Loads and shuffles DBpedia, byte-encodes the documents, then alternates
  training with train/test evaluation every --steps_per_eval steps, printing
  final test accuracy at the end.

  Args:
    unused_argv: Unused command-line arguments.
  """
  tpu_grpc_url = None
  tpu_cluster_resolver = None
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use
    if FLAGS.master:
      if FLAGS.tpu_name:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      # No explicit master: resolve the TPU by name; the resolver is handed
      # to RunConfig below via the `cluster` argument.
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
  else:
    # URL is unused if running locally without TPU
    tpu_grpc_url = None

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      cluster=tpu_cluster_resolver,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores))

  # Prepare training and testing data
  dbpedia = tf.contrib.learn.datasets.load_dataset(
      'dbpedia', size='large', test_with_fake_data=FLAGS.test_with_fake_data)

  # Shuffle the training set via a shared permutation so x/y stay aligned.
  print("Shuffling data set...")
  x_train = dbpedia.train.data[:, 1]
  y_train = dbpedia.train.target
  s = np.arange(len(y_train))
  np.random.shuffle(s)
  x_train = x_train[s]
  y_train = y_train[s]
  print("Done!")

  x_train = pandas.Series(x_train)
  y_train = pandas.Series(y_train)
  x_test = pandas.Series(dbpedia.test.data[:, 1])
  y_test = pandas.Series(dbpedia.test.target)
  print('Train data size:', x_train.shape)
  print('Test data size:', x_test.shape)

  # Process vocabulary: encode each document as fixed-length byte ids.
  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
      MAX_DOCUMENT_LENGTH)
  x_train = np.array(list(char_processor.fit_transform(x_train)))
  x_test = np.array(list(char_processor.transform(x_test)))

  # Build model
  classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=char_rnn_model,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  def TPU_train_input_fn(params):
    # batch_size is supplied by TPUEstimator via params.
    return tf.estimator.inputs.numpy_input_fn(
        x={CHARS_FEATURE: x_train},
        y=y_train,
        batch_size=params['batch_size'],
        num_epochs=None,
        shuffle=True)()

  def TPU_test_input_fn(params):
    # Single pass, no shuffling, for deterministic evaluation.
    return tf.estimator.inputs.numpy_input_fn(
        x={CHARS_FEATURE: x_test},
        y=y_test,
        batch_size=params['batch_size'],
        num_epochs=1,
        shuffle=False)()

  # Train.
  current_step = 0
  while current_step < FLAGS.train_steps:
    # Train for up to steps_per_eval number of steps.
    # At the end of training, a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                          FLAGS.train_steps)
    classifier.train(input_fn=TPU_train_input_fn, max_steps=next_checkpoint)
    current_step = next_checkpoint

    # Eval.
    tf.logging.info('Starting to evaluate.')
    eval_results = classifier.evaluate(input_fn=TPU_test_input_fn)
    tf.logging.info('Test eval results: %s' % eval_results)
    eval_results = classifier.evaluate(input_fn=TPU_train_input_fn)
    # This evaluation runs on the *training* input; the previous label
    # ('Test eval results') was misleading.
    tf.logging.info('Train eval results: %s' % eval_results)

  scores = classifier.evaluate(input_fn=TPU_test_input_fn)
  print('Accuracy: {0:f}'.format(scores['accuracy']))
def main(argv):
  """Trains/evaluates a sparsity-pruned ResNet on ImageNet with TPUEstimator.

  Builds the output directory (optionally nested by hyperparameter "folder
  stub"), the train/eval input pipelines, and two TPUEstimators (one for
  train/eval, one CPU-exporting variant), then dispatches on --mode:
  'eval_once', continuous 'eval', 'train', or 'train_and_eval'.

  Args:
    argv: Unused command-line arguments.

  Raises:
    ValueError: If --eval_batch_size does not evenly divide --num_eval_images.
  """
  del argv  # Unused.
  tf.enable_resource_variables()
  tf.set_random_seed(FLAGS.seed)
  set_lr_schedule()
  set_custom_sparsity_map()
  folder_stub = os.path.join(FLAGS.training_method, str(FLAGS.end_sparsity),
                             str(FLAGS.maskupdate_begin_step),
                             str(FLAGS.maskupdate_end_step),
                             str(FLAGS.maskupdate_frequency),
                             str(FLAGS.drop_fraction),
                             str(FLAGS.label_smoothing),
                             str(FLAGS.weight_decay))

  output_dir = FLAGS.output_dir
  if FLAGS.use_folder_stub:
    output_dir = os.path.join(output_dir, folder_stub)

  export_dir = os.path.join(output_dir, 'export_dir')

  # we pass the updated eval and train string to the params dictionary.
  params = {}
  params['output_dir'] = output_dir
  params['training_method'] = FLAGS.training_method
  params['use_tpu'] = FLAGS.use_tpu

  dataset_func = functools.partial(
      imagenet_input.ImageNetInput,
      data_dir=FLAGS.data_directory,
      transpose_input=False,
      num_parallel_calls=FLAGS.num_parallel_calls,
      use_bfloat16=False)
  imagenet_train, imagenet_eval = [
      dataset_func(is_training=is_training) for is_training in [True, False]
  ]

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=output_dir,
      save_checkpoints_steps=FLAGS.steps_per_checkpoint,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          tpu_job_name=FLAGS.tpu_job_name))

  classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn_w_pruning,
      params=params,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  # Second estimator that exports for CPU serving (export_to_tpu=False).
  cpu_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn_w_pruning,
      params=params,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      export_to_tpu=False,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0:
    raise ValueError(
        'eval_batch_size (%d) must evenly divide num_eval_images(%d)!' %
        (FLAGS.eval_batch_size, FLAGS.num_eval_images))

  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

  if FLAGS.mode == 'eval_once':
    ckpt_path = os.path.join(output_dir, FLAGS.eval_once_ckpt_prefix)
    dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
    classifier.evaluate(
        input_fn=dataset.input_fn,
        steps=eval_steps,
        checkpoint_path=ckpt_path,
        name='{0}'.format(FLAGS.eval_once_ckpt_prefix))
  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(output_dir):
      tf.logging.info('Starting to evaluate.')
      try:
        dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval
        classifier.evaluate(
            input_fn=dataset.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt,
            name='eval')
        # Terminate eval job when final checkpoint is reached
        global_step = int(os.path.basename(ckpt).split('-')[1])
        if global_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % global_step)
          break
      except tf.errors.NotFoundError:
        # Fixed: this previously called the `logging` module as a function,
        # which raises TypeError instead of logging.
        tf.logging.info('Checkpoint no longer exists, skipping checkpoint.')

  else:
    global_step = estimator._load_global_step_from_checkpoint_dir(output_dir)  # pylint: disable=protected-access
    # Session run hooks to export model for prediction
    export_hook = ExportModelHook(cpu_classifier, export_dir)
    hooks = [export_hook]

    if FLAGS.mode == 'train':
      tf.logging.info('start training...')
      classifier.train(
          input_fn=imagenet_train.input_fn,
          hooks=hooks,
          max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.mode == 'train_and_eval'
      tf.logging.info('start training and eval...')
      while global_step < FLAGS.train_steps:
        next_checkpoint = min(global_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        global_step = next_checkpoint
        # Fixed: this previously called the `logging` module as a function.
        tf.logging.info('Completed training up to step : %d', global_step)
        classifier.evaluate(input_fn=imagenet_eval.input_fn, steps=eval_steps)
def main(unused_argv):
  """Trains and/or evaluates a ResNet classifier on ImageNet via TPUEstimator.

  Resolves the TPU target (explicit --master, named TPU, or local CPU/GPU),
  then dispatches on --mode: continuous evaluation over new checkpoints
  ('eval'), plain training ('train'), or alternating train/eval cycles
  ('train_and_eval'). Optionally exports a SavedModel at the end.

  Args:
    unused_argv: Unused command-line arguments.

  Raises:
    RuntimeError: If --use_tpu is set but neither --master nor --tpu_name is.
  """
  tpu_grpc_url = None
  tpu_cluster_resolver = None
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use
    if not FLAGS.master and not FLAGS.tpu_name:
      raise RuntimeError('You must specify either --master or --tpu_name.')
    if FLAGS.master:
      if FLAGS.tpu_name:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      # Resolver is passed to RunConfig via `cluster`; no URL needed here.
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
  else:
    # URL is unused if running locally without TPU
    tpu_grpc_url = None

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      cluster=tpu_cluster_resolver,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores))

  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False, data_dir=FLAGS.data_dir)

  if FLAGS.mode == 'eval':
    eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.' % (FLAGS.train_steps,
                                   FLAGS.train_steps / batches_per_epoch,
                                   current_step))
    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == 'train':
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                    (FLAGS.train_steps, elapsed_time))

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #   https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main(argv):
  """Trains or evaluates a RetinaNet detector on TPU (or CPU/GPU for eval).

  In 'train' mode trains for num_epochs * num_examples_per_epoch /
  train_batch_size steps, optionally evaluating afterwards. In 'eval' mode
  continuously evaluates new checkpoints on CPU/GPU with batch size 1.

  Args:
    argv: Unused command-line arguments.

  Raises:
    RuntimeError: If a flag required by the selected --mode is missing.
  """
  del argv  # Unused.
  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  # Check data path. NOTE: these comparisons previously used `is`, which
  # tests object identity, not string equality — the checks only worked by
  # accident of CPython string interning.
  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(FLAGS.training_file_pattern,
                                        is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
          use_bfloat16=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                          is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # eval only runs on CPU or GPU host with batch_size = 1

    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )

    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(FLAGS.validation_file_pattern,
                                            is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info('Evaluation finished after training step %d' %
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint' %
                        ckpt)

  else:
    tf.logging.info('Mode not found.')