def testReturnsEmptyIfNoCheckpointsFound(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), 'no_checkpoints_found')

  num_found = 0
  for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 0)
def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(),
                                'one_checkpoint_found_sharded')
  if not gfile.Exists(checkpoint_dir):
    gfile.MakeDirs(checkpoint_dir)

  global_step = variables.get_or_create_global_step()

  # This will result in 3 different checkpoint shard files.
  with ops.device('/cpu:0'):
    variables_lib.Variable(10, name='v0')
  with ops.device('/cpu:1'):
    variables_lib.Variable(20, name='v1')

  saver = saver_lib.Saver(sharded=True)

  with session_lib.Session(
      target='',
      config=config_pb2.ConfigProto(device_count={'CPU': 2})) as session:
    session.run(variables_lib.global_variables_initializer())
    save_path = os.path.join(checkpoint_dir, 'model.ckpt')
    saver.save(session, save_path, global_step=global_step)

  num_found = 0
  for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 1)
def testTimeoutFn(self):
  timeout_fn_calls = [0]

  def timeout_fn():
    timeout_fn_calls[0] += 1
    return timeout_fn_calls[0] > 3

  results = list(
      evaluation.checkpoints_iterator(
          '/non-existent-dir', timeout=0.1, timeout_fn=timeout_fn))
  self.assertEqual([], results)
  self.assertEqual(4, timeout_fn_calls[0])
def main(unused_argv):
  # pylint: disable=g-long-lambda
  if FLAGS.mode == "preprocess":
    prepare_dataset(FLAGS)
  elif FLAGS.mode == "train":
    print("Running training mode.")
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams, estimator.train_fn)
  elif FLAGS.mode == "train_and_eval":
    print("Running training and evaluation mode.")
    default_hparams = create_hparams(FLAGS)
    run_main(FLAGS, default_hparams,
             estimator.train_and_eval_with_low_level_api)
  else:
    print("Running inference mode.")
    default_hparams = create_hparams(FLAGS)
    current_epoch = 0
    last_step = 0
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(FLAGS.out_dir):
      # Terminate eval job once target score is reached
      current_step = int(os.path.basename(ckpt).split("-")[1])
      if current_step <= last_step:
        continue
      last_step = current_step
      tf.logging.info("Starting to evaluate...%s", ckpt)
      try:
        score = run_main(FLAGS, default_hparams, estimator.eval_fn)
        current_epoch += 1
        if score > FLAGS.target_bleu:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break

        # Terminate eval job when final checkpoint is reached
        max_steps = default_hparams.num_train_steps
        if current_step >= max_steps:
          tf.logging.info(
              "Evaluation finished but failed to reach target score.")
          break
      except tf.errors.NotFoundError:
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)
def testReturnsSingleCheckpointIfOneCheckpointFound(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), 'one_checkpoint_found')
  if not gfile.Exists(checkpoint_dir):
    gfile.MakeDirs(checkpoint_dir)

  global_step = variables.get_or_create_global_step()
  saver = saver_lib.Saver()  # Saves the global step.

  with self.test_session() as session:
    session.run(variables_lib.global_variables_initializer())
    save_path = os.path.join(checkpoint_dir, 'model.ckpt')
    saver.save(session, save_path, global_step=global_step)

  num_found = 0
  for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
    num_found += 1
  self.assertEqual(num_found, 1)
def evaluate(model_est, imagenet_eval, params):
  """Conducts eval and maybe exports the model.

  Args:
    model_est: `TPUEstimator` instance for the discovered model.
    imagenet_eval: Input pipeline for the validation set.
    params: Dictionary containing parameters.
  """
  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
  # Run evaluation when there's a new checkpoint
  for ckpt in evaluation.checkpoints_iterator(
      FLAGS.model_dir, timeout=FLAGS.eval_timeout):
    tf.logging.info('Starting to evaluate.')
    try:
      start_timestamp = time.time()  # This time will include compilation time
      eval_results = model_est.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          checkpoint_path=ckpt)
      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                      elapsed_time)

      # Terminate eval job when final checkpoint is reached
      current_step = int(os.path.basename(ckpt).split('-')[1])
      if current_step >= FLAGS.train_steps:
        tf.logging.info('Evaluation finished after training step %d',
                        current_step)
        break
    except tf.errors.NotFoundError:
      # Since the coordinator is on a different job than the TPU worker,
      # sometimes the TPU worker does not finish initializing until long after
      # the CPU job tells it to start evaluating. In this case, the checkpoint
      # file could have been deleted already.
      tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                      ckpt)

  if FLAGS.export_dir:
    export(model_est, FLAGS.export_dir)
def main(unused_argv):
  params = params_dict.ParamsDict(
      mnasnet_config.MNASNET_CFG, mnasnet_config.MNASNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)

  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  additional_params = {
      'steps_per_epoch': params.num_train_images / params.train_batch_size,
      'quantized_training': FLAGS.quantized_training,
  }

  params = params_dict.override_params_dict(
      params, additional_params, is_strict=False)

  params.validate()
  params.lock()

  if FLAGS.tpu or params.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, params.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  # Validates Flags.
  if params.precision == 'bfloat16' and params.use_keras:
    raise ValueError(
        'Keras layers do not have full support to bfloat16 activation training.'
        ' You have set precision as %s and use_keras as %s' %
        (params.precision, params.use_keras))

  # Initializes model parameters.
  mnasnet_est = tf.contrib.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=mnasnet_model_fn,
      config=config,
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params.as_dict())

  if FLAGS.mode == 'export_only':
    export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    return

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=params.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.input_image_size,
            num_parallel_calls=params.num_parallel_calls,
            use_bfloat16=(params.precision == 'bfloat16'))
        for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = params.num_eval_images // params.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(  # pylint: disable=protected-access
        FLAGS.model_dir)

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', params.train_steps,
        params.train_steps / params.steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, params.iterations_per_loop)))
      mnasnet_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        mnasnet_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
def main(unused_argv):
  del unused_argv  # Unused

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  params = {
      'input_perm': [0, 1, 2, 3],
      'output_perm': [0, 1, 2, 3],
  }

  batch_axis = 0
  if FLAGS.transpose_enabled:
    params['input_perm'] = [3, 0, 1, 2]
    params['output_perm'] = [1, 2, 3, 0]
    batch_axis = 3

  if FLAGS.eval_total_size > 0:
    eval_size = FLAGS.eval_total_size
  else:
    eval_size = _NUM_EVAL_IMAGES
  eval_steps = eval_size // FLAGS.eval_batch_size

  iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

  eval_batch_size = (None if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

  tpu_config = contrib_tpu.TPUConfig(
      iterations_per_loop=iterations, num_shards=FLAGS.num_shards)

  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config)

  inception_classifier = contrib_tpu.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params=params,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=eval_batch_size,
      batch_axis=(batch_axis, 0))

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

  if FLAGS.moving_average:
    eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
  else:
    eval_hooks = []

  if FLAGS.mode == 'eval':
    # Run evaluation when there is a new checkpoint
    for checkpoint in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # Includes compilation time
        eval_results = inception_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(checkpoint).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        checkpoint)

  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
      tf.logging.info('Starting training cycle %d.' % cycle)
      inception_classifier.train(
          input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps_per_eval)

      tf.logging.info('Starting evaluation cycle %d .' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s' % eval_results)

  else:
    tf.logging.info('Starting training ...')
    inception_classifier.train(
        input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps)

  if FLAGS.export_dir is not None:
    tf.logging.info('Starting to export model.')
    inception_classifier.export_saved_model(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=image_serving_input_fn)
params = dict(weight_decay=args.weight_decay)
tpu_estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    train_batch_size=args.batch_size,
    eval_batch_size=args.batch_size,
    params=params)

hooks = []
hooks.append(
    async_checkpoint.AsyncCheckpointSaverHook(
        checkpoint_dir=args.model_dir, save_steps=iterations_per_loop))

train_input_fn = make_input_fn(data, labels)
eval_input_fn = make_input_fn(test_data, test_labels)

if pid > 0:
  tpu_estimator.train(
      input_fn=train_input_fn,
      steps=args.num_epochs * steps_per_epoch,
      hooks=hooks)
  # Sleep so that eval can finish before closing.
  time.sleep(360)
else:
  for ckpt in evaluation.checkpoints_iterator(args.model_dir):
    eval_results = tpu_estimator.evaluate(
        input_fn=eval_input_fn,
        steps=len(test_data) // args.batch_size,
        checkpoint_path=ckpt)
    print("Eval results: %s" % eval_results)
def main(unused_argv):
  tpu_grpc_url = None
  tpu_cluster_resolver = None
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use
    if not FLAGS.master and not FLAGS.tpu_name:
      raise RuntimeError('You must specify either --master or --tpu_name.')

    if FLAGS.master:
      if FLAGS.tpu_name:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name,
              zone=FLAGS.tpu_zone,
              project=FLAGS.gcp_project))
  else:
    # URL is unused if running locally without TPU
    tpu_grpc_url = None

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      cluster=tpu_cluster_resolver,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores))

  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False, data_dir=FLAGS.data_dir)

  if FLAGS.mode == 'eval':
    eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.' % (FLAGS.train_steps,
                                   FLAGS.train_steps / batches_per_epoch,
                                   current_step))
    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == 'train':
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                      (FLAGS.train_steps, elapsed_time))

  if FLAGS.export_dir is not None:
    # The guide to serve an exported TensorFlow model is at:
    #    https://www.tensorflow.org/serving/serving_basic
    tf.logging.info('Starting to export model.')
    resnet_classifier.export_savedmodel(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def get_next_checkpoint():
  return evaluation.checkpoints_iterator(
      FLAGS.model_dir,
      min_interval_secs=FLAGS.min_eval_interval,
      timeout=FLAGS.eval_timeout,
      timeout_fn=terminate_eval)
def _get_next_checkpoint():
  return evaluation.checkpoints_iterator(
      FLAGS.model_dir, timeout=60 * 60 * 24, timeout_fn=_terminate_eval)
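# The two wrappers above assume a terminate_eval/_terminate_eval callback;
# checkpoints_iterator calls it once no new checkpoint has appeared within
# `timeout` seconds and stops iterating when it returns True. A minimal sketch
# of such a callback (FLAGS.eval_timeout is assumed here, mirroring the flags
# used in the other snippets in this collection):
def _terminate_eval():
  tf.logging.info('No new checkpoint after %s seconds; stopping eval loop.',
                  FLAGS.eval_timeout)
  return True  # Returning False instead would keep the iterator waiting.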
def main(argv):
  del argv  # Unused.

  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    tf.Session.reset(tpu_grpc_url)
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode == 'train' and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode == 'eval':
    if FLAGS.valid_data_dir is None:
      raise RuntimeError('You must specify --valid_data_dir for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

  # Parse hparams
  hparams = retinanet_model.default_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_shards=FLAGS.num_shards,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      use_tpu=FLAGS.use_tpu,
      resnet_checkpoint=FLAGS.resnet_checkpoint,
      val_json_file=FLAGS.val_json_file,
      mode=FLAGS.mode,
  )
  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  run_config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      evaluation_master=FLAGS.eval_master,
      model_dir=FLAGS.model_dir,
      log_step_count_steps=FLAGS.iterations_per_loop,
      session_config=config_proto,
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations_per_loop,
                                      FLAGS.num_shards))

  # TPU Estimator
  if FLAGS.mode == 'train':
    train_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=params)
    train_estimator.train(
        input_fn=dataloader.InputReader(
            FLAGS.training_file_pattern, is_training=True),
        max_steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                      FLAGS.train_batch_size))

    if FLAGS.eval_after_training:
      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          use_tpu=False,
          input_rand_hflip=False,
          skip_crowd=False,
          resnet_checkpoint=None,
          is_training_bn=False,
          use_bfloat16=False,
      )
      eval_estimator = tpu_estimator.TPUEstimator(
          model_fn=retinanet_model.retinanet_model_fn,
          use_tpu=False,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=1,
          config=run_config,
          params=eval_params)
      eval_results = eval_estimator.evaluate(
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          steps=FLAGS.eval_steps)
      tf.logging.info('Eval results: %s' % eval_results)

  elif FLAGS.mode == 'eval':
    # eval only runs on CPU or GPU host with batch_size = 1
    # Override the default options: disable randomization in the input pipeline
    # and don't run on the TPU.
    eval_params = dict(
        params,
        use_tpu=False,
        input_rand_hflip=False,
        skip_crowd=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = tpu_estimator.TPUEstimator(
        model_fn=retinanet_model.retinanet_model_fn,
        use_tpu=False,
        eval_batch_size=1,
        train_batch_size=FLAGS.train_batch_size,
        config=run_config,
        params=eval_params)

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):

      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = eval_estimator.evaluate(
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            steps=FLAGS.eval_steps)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                         FLAGS.train_batch_size)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  else:
    tf.logging.info('Mode not found.')
def main(unused_argv):
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use
    if FLAGS.master is None and FLAGS.tpu_name is None:
      raise RuntimeError('You must specify either --master or --tpu_name.')

    if FLAGS.master is not None:
      if FLAGS.tpu_name is not None:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name,
              zone=FLAGS.tpu_zone,
              project=FLAGS.gcp_project))
      tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    # URL is unused if running locally without TPU
    tpu_grpc_url = None

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.iterations_per_loop,
      keep_checkpoint_max=5,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))

  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data_dir,
      num_parallel_calls=FLAGS.num_parallel_calls,
      use_transpose=FLAGS.use_transpose)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data_dir,
      num_parallel_calls=FLAGS.num_parallel_calls,
      use_transpose=FLAGS.use_transpose)

  current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
  steps_per_epoch = NUM_TRAIN_IMAGES // FLAGS.train_batch_size

  start_timestamp = time.time()
  current_epoch = current_step // steps_per_epoch

  if FLAGS.mode == 'train':
    resnet_classifier.train(
        input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
    training_time = time.time() - start_timestamp
    tf.logging.info('Finished training in %d seconds' % training_time)

    with tf.gfile.GFile(FLAGS.model_dir + '/total_time_%s.txt' % training_time, 'w') as f:  # pylint: disable=line-too-long
      f.write('Total training time was %s seconds' % training_time)

  elif FLAGS.mode == 'eval':
    results = []

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(FLAGS.model_dir):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        current_step = int(os.path.basename(ckpt).split('-')[1])
        current_epoch = current_step // steps_per_epoch
        results.append([
            current_epoch,
            '{0:.2f}'.format(eval_results['top_1_accuracy'] * 100),
            '{0:.2f}'.format(eval_results['top_5_accuracy'] * 100),
        ])

        # Terminate eval job when final checkpoint is reached
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

    with tf.gfile.GFile(FLAGS.model_dir + '/epoch_results_eval.tsv', 'wb') as tsv_file:  # pylint: disable=line-too-long
      writer = csv.writer(tsv_file, delimiter='\t')
      writer.writerow(['epoch', 'top1Accuracy', 'top5Accuracy'])
      writer.writerows(results)

  elif FLAGS.mode == 'train_and_eval':
    results = []
    while current_epoch < 95:
      next_checkpoint = (current_epoch + 1) * steps_per_epoch
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
      current_epoch += 1

      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                      (next_checkpoint, int(time.time() - start_timestamp)))

      # Evaluate the model on the most recent model in --model_dir.
      # Since evaluation happens in batches of --eval_batch_size, some images
      # may be excluded modulo the batch size. As long as the batch size is
      # consistent, the evaluated images are also consistent.
      tf.logging.info('Starting to evaluate.')
      eval_results = resnet_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
      tf.logging.info('Eval results: %s' % eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished epoch %s at %s time' % (current_epoch,
                                                        elapsed_time))
      results.append([
          current_epoch,
          elapsed_time / 3600.0,
          '{0:.2f}'.format(eval_results['top_1_accuracy'] * 100),
          '{0:.2f}'.format(eval_results['top_5_accuracy'] * 100),
      ])

    with tf.gfile.GFile(FLAGS.model_dir + '/epoch_results_train_eval.tsv', 'wb') as tsv_file:  # pylint: disable=line-too-long
      writer = csv.writer(tsv_file, delimiter='\t')
      writer.writerow(['epoch', 'hours', 'top1Accuracy', 'top5Accuracy'])
      writer.writerows(results)
  else:
    tf.logging.info('Mode not found.')

  if FLAGS.export_dir is not None:
    # The guide to serve an exported TensorFlow model is at:
    #    https://www.tensorflow.org/serving/serving_basic
    tf.logging.info('Starting to export model.')
    resnet_classifier.export_savedmodel(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main(_):
  if FLAGS.pruning_method in ['threshold']:
    folder_stub = os.path.join(FLAGS.pruning_method, str(FLAGS.end_sparsity),
                               str(FLAGS.sparsity_begin_step),
                               str(FLAGS.sparsity_end_step),
                               str(FLAGS.pruning_frequency),
                               str(FLAGS.label_smoothing))
  elif FLAGS.pruning_method == 'variational_dropout':
    folder_stub = os.path.join(FLAGS.pruning_method,
                               str(FLAGS.sparsity_begin_step),
                               str(FLAGS.sparsity_end_step),
                               str(FLAGS.reg_scalar),
                               str(FLAGS.label_smoothing))
  elif FLAGS.pruning_method == 'l0_regularization':
    folder_stub = os.path.join(FLAGS.pruning_method,
                               str(FLAGS.sparsity_begin_step),
                               str(FLAGS.sparsity_end_step),
                               str(FLAGS.reg_scalar),
                               str(FLAGS.label_smoothing))
  elif FLAGS.pruning_method == 'baseline':
    folder_stub = os.path.join(FLAGS.pruning_method, str(0.0), str(0.0),
                               str(0.0), str(0.0))
  elif FLAGS.pruning_method == 'scratch':
    run_info = FLAGS.load_mask_dir.split('/')
    run_type = run_info[10]
    run_sparsity = run_info[11]
    run_begin = run_info[12]
    run_end = run_info[13]
    run_freq = run_info[14]
    run_label_smoothing = run_info[15]
    folder_stub = os.path.join(FLAGS.pruning_method, run_type, run_sparsity,
                               run_begin, run_end, run_freq,
                               run_label_smoothing, FLAGS.init_method)
  else:
    raise ValueError('Pruning method is not known %s' %
                     (FLAGS.pruning_method))

  output_dir = os.path.join(FLAGS.output_dir, folder_stub)
  export_dir = os.path.join(output_dir, 'export_dir')

  # we pass the updated eval and train string to the params dictionary.
  params = {}
  params['output_dir'] = output_dir
  params['pruning_method'] = FLAGS.pruning_method
  params['use_tpu'] = FLAGS.use_tpu
  params['log_alpha_threshold'] = FLAGS.log_alpha_threshold

  imagenet_train, imagenet_eval = [
      imagenet_input.ImageNetInput(  # pylint: disable=g-complex-comprehension
          is_training=is_training,
          data_dir=FLAGS.data_directory,
          transpose_input=False,
          num_parallel_calls=FLAGS.num_parallel_calls,
          use_bfloat16=False) for is_training in [True, False]
  ]

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=output_dir,
      save_checkpoints_steps=FLAGS.steps_per_checkpoint,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          tpu_job_name=FLAGS.tpu_job_name))

  classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn_w_pruning,
      params=params,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  cpu_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn_w_pruning,
      params=params,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      export_to_tpu=False,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0:
    raise ValueError(
        'eval_batch_size (%d) must evenly divide num_eval_images (%d)!' %
        (FLAGS.eval_batch_size, FLAGS.num_eval_images))

  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

  if FLAGS.mode == 'eval_once':
    ckpt = FLAGS.output_dir + 'model.ckpt-{}'.format(FLAGS.checkpoint_step)
    classifier.evaluate(
        input_fn=imagenet_eval.input_fn,
        steps=eval_steps,
        checkpoint_path=ckpt,
        name='{0}'.format(int(FLAGS.log_alpha_threshold * 10)))
  elif FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(output_dir):
      print('Starting to evaluate.')
      try:
        classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt,
            name='{0}'.format(int(FLAGS.log_alpha_threshold * 10)))
        # Terminate eval job when final checkpoint is reached
        global_step = int(os.path.basename(ckpt).split('-')[1])
        if global_step >= FLAGS.train_steps:
          print('Evaluation finished after training step %d' % global_step)
          break
      except tf.errors.NotFoundError:
        logging('Checkpoint no longer exists, skipping checkpoint.')
  else:
    global_step = tf.estimator._load_global_step_from_checkpoint_dir(output_dir)  # pylint: disable=protected-access,line-too-long
    # Session run hooks to export model for prediction
    export_hook = ExportModelHook(cpu_classifier, export_dir)
    hooks = [export_hook]

    if FLAGS.mode == 'train':
      print('start training...')
      classifier.train(
          input_fn=imagenet_train.input_fn,
          hooks=hooks,
          max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.mode == 'train_and_eval'
      print('start training and eval...')
      while global_step < FLAGS.train_steps:
        next_checkpoint = min(global_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        global_step = next_checkpoint
        logging('Completed training up to step :', global_step)
        classifier.evaluate(input_fn=imagenet_eval.input_fn, steps=eval_steps)
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  # RevNet specific configuration
  config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)

  if FLAGS.use_tpu:
    tf.logging.info("Using TPU.")
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  # TPU specific configuration
  tpu_config = tf.contrib.tpu.TPUConfig(
      # Recommended to be set as number of global steps for next checkpoint
      iterations_per_loop=FLAGS.iterations_per_loop,
      num_shards=FLAGS.num_shards)

  # Estimator specific configuration
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config,
  )

  # Construct TPU Estimator
  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=config.tpu_batch_size,
      eval_batch_size=config.tpu_eval_batch_size,
      config=run_config,
      params={"config": config})

  # Construct input functions
  train_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="train_all")
  eval_input_fn = get_input_fn(
      config=config, data_dir=FLAGS.data_dir, split="test")

  # Disabling a range within an else block currently doesn't work
  # due to https://github.com/PyCQA/pylint/issues/872
  # pylint: disable=protected-access
  if FLAGS.mode == "eval":
    # TPUEstimator.evaluate *requires* a steps argument.
    # Note that the number of examples used during evaluation is
    # --eval_steps * --batch_size.
    # So if you change --batch_size then change --eval_steps too.
    eval_steps = 10000 // config.tpu_eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info("Starting to evaluate.")
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split("-")[1])
        if current_step >= config.max_train_iter:
          tf.logging.info(
              "Evaluation finished after training step %d" % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator_._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)
    tf.logging.info("Training for %d steps . Current"
                    " step %d." % (config.max_train_iter, current_step))

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == "train":
      estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter)
    else:
      eval_steps = 10000 // config.tpu_eval_batch_size
      assert FLAGS.mode == "train_and_eval"
      while current_step < config.max_train_iter:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              config.max_train_iter)
        estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info("Starting to evaluate.")
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("Eval results: %s" % eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
                      (config.max_train_iter, elapsed_time))
def testMonitorCheckpointsLoopTimeout(self):
  ret = list(
      evaluation_lib.checkpoints_iterator('/non-existent-dir', timeout=0))
  self.assertEqual(ret, [])
def wait_for_checkpoint(path):
  from tensorflow.contrib.training.python.training import evaluation
  return evaluation.checkpoints_iterator(path)
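# A minimal usage sketch for the wrapper above: with no timeout argument,
# checkpoints_iterator blocks until each new checkpoint appears and yields its
# path. The '/tmp/model_dir' path and evaluate_checkpoint helper are
# illustrative placeholders, not part of the original snippet.
for ckpt_path in wait_for_checkpoint('/tmp/model_dir'):
  print('Found new checkpoint: %s' % ckpt_path)
  evaluate_checkpoint(ckpt_path)  # Placeholder for per-checkpoint work.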
def main(unused_argv):
  tf.flags.mark_flag_as_required('model_dir')
  tf.flags.mark_flag_as_required('pipeline_config_path')

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError('You must specify either --master or --tpu_name.')

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn('Both --master and --tpu_name are set. Ignoring '
                      '--tpu_name and using --master.')
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))
  params = {}
  estimator, train_input_fn, eval_input_fn, train_steps, eval_steps = (
      create_estimator(
          config,
          model_hparams.create_hparams(),
          FLAGS.pipeline_config_path,
          train_steps=FLAGS.num_train_steps,
          eval_steps=FLAGS.num_eval_steps,
          train_batch_size=FLAGS.train_batch_size,
          use_tpu=FLAGS.use_tpu,
          num_shards=FLAGS.num_shards,
          params=params))

  if FLAGS.mode in ['train', 'train_and_eval']:
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
  if FLAGS.mode == 'train_and_eval':
    # Eval one time.
    eval_results = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    tf.logging.info('Eval results: %s' % eval_results)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout_secs)
      return True

    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval_secs,
        timeout=FLAGS.eval_timeout_secs,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = estimator.evaluate(
            input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
def main(unused_argv):
  # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #     FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
  #     zone=FLAGS.tpu_zone,
  #     project=FLAGS.gcp_project)

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

  NUM_GPUS = len(get_available_gpus())
  distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
  gpu_options = tf.GPUOptions(allow_growth=True)

  # config = tf.contrib.tpu.RunConfig(
  #     cluster=tpu_cluster_resolver,
  #     model_dir=FLAGS.model_dir,
  #     save_checkpoints_steps=save_checkpoints_steps,
  #     log_step_count_steps=FLAGS.log_step_count_steps,
  #     session_config=tf.ConfigProto(
  #         graph_options=tf.GraphOptions(
  #             rewrite_options=rewriter_config_pb2.RewriterConfig(
  #                 disable_meta_optimizer=True))),
  #     tpu_config=tf.contrib.tpu.TPUConfig(
  #         iterations_per_loop=FLAGS.iterations_per_loop,
  #         per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
  #         .PER_HOST_V2))  # pylint: disable=line-too-long
  config = tf.estimator.RunConfig(
      # cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True)),
          gpu_options=gpu_options),
      train_distribute=distribution,
      # tpu_config=tf.contrib.tpu.TPUConfig(
      #     iterations_per_loop=FLAGS.iterations_per_loop,
      #     per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      #     .PER_HOST_V2)
  )

  # Initializes model parameters.
  # params = dict(steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size)
  # model_est = tf.estimator.Estimator(
  #     use_tpu=FLAGS.use_tpu,
  #     model_fn=final_model_fn,
  #     config=config,
  #     train_batch_size=FLAGS.train_batch_size,
  #     eval_batch_size=FLAGS.eval_batch_size,
  #     export_to_tpu=FLAGS.export_to_tpu,
  #     params=params)
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      batch_size=FLAGS.train_batch_size)
  model_est = tf.estimator.Estimator(
      model_fn=final_model_fn, config=config, params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.input_image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=False) for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if FLAGS.export_dir:
      export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      model_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        model_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = model_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

  if FLAGS.export_dir:
    export(model_est, FLAGS.export_dir, FLAGS.post_quantize)
def main(unused_argv):
  model_config.show_info()
  train_config.show_info()
  preproc_config.show_info()

  ## ckpt dir create
  now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
  curr_model_dir = "{}/run-{}/".format(FLAGS.model_dir, now)
  curr_model_dir_local = "{}/run-{}/".format(EXPORT_MODEL_DIR, now)

  tf.logging.info('[main] data dir = %s' % FLAGS.data_dir)
  tf.logging.info('[main] model dir = %s' % curr_model_dir)
  tf.logging.info('[main] config logging dir = %s' % curr_model_dir_local)
  tf.logging.info('------------------------')

  if not tf.gfile.Exists(curr_model_dir):
    tf.gfile.MakeDirs(curr_model_dir)

  if not tf.gfile.Exists(curr_model_dir_local):
    tf.gfile.MakeDirs(curr_model_dir_local)

  FLAGS.model_dir = curr_model_dir

  # # logging config information
  tf.logging.info(str(train_config_dict))
  tf.logging.info(str(model_config_dict))
  tf.logging.info(str(preproc_config_dict))

  train_config_filename = curr_model_dir_local + 'train_config' + '.json'
  model_config_filename = curr_model_dir_local + 'model_config' + '.json'
  preproc_config_filename = curr_model_dir_local + 'preproc_config' + '.json'

  with open(train_config_filename, 'w') as fp:
    json.dump(str(train_config_dict), fp)

  with open(model_config_filename, 'w') as fp:
    json.dump(str(model_config_dict), fp)

  with open(preproc_config_filename, 'w') as fp:
    json.dump(str(preproc_config_dict), fp)

  try:
    cmd = "sudo gsutil cp -r {} {}".format(curr_model_dir_local + '* ',
                                           curr_model_dir)
    print('[main] cmd=%s' % cmd)
    check_output(cmd, shell=True)
    tf.logging.info('[main] success logging config in bucket')
  except:
    tf.logging.info('[main] failure logging config in bucket')

  # for CPU or GPU use
  config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False,
      gpu_options=tf.GPUOptions(allow_growth=True))

  config = tf.estimator.RunConfig(
      model_dir=FLAGS.model_dir,
      tf_random_seed=None,
      save_summary_steps=FLAGS.summary_step,
      save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
      session_config=config,
      keep_checkpoint_max=5,
      keep_checkpoint_every_n_hours=10000,
      log_step_count_steps=FLAGS.log_step_count_steps,
      train_distribute=None)

  dontbeturtle_estimator = tf.estimator.Estimator(
      model_dir=FLAGS.model_dir,
      model_fn=model_fn,
      config=config,
      params=None,
      warm_start_from=None)

  '''
  # data loader
  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  '''
  dataset_train, dataset_eval = [
      data_loader_coco.DataSetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          use_bfloat16=False) for is_training in [True, False]
  ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = dontbeturtle_estimator.evaluate(
            input_fn=dataset_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the GPU worker,
        # sometimes the GPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    batchnum_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size

    tf.logging.info('[main] num_train_images=%s' % FLAGS.num_train_images)
    tf.logging.info('[main] train_batch_size=%s' % FLAGS.train_batch_size)
    tf.logging.info('[main] batchnum_per_epoch=%s' % batchnum_per_epoch)
    tf.logging.info(
        '[main] Training for %d steps (%.2f epochs in total). Current'
        ' step %d.' % (FLAGS.train_steps,
                       FLAGS.train_steps / batchnum_per_epoch, current_step))

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      dontbeturtle_estimator.train(
          input_fn=dataset_train.input_fn, max_steps=FLAGS.train_steps)
      tf.logging.info('[main] Training only')
    else:
      assert FLAGS.mode == 'train_and_eval'
      tf.logging.info('[main] Training and Evaluation')

      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        dontbeturtle_estimator.train(
            input_fn=dataset_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info('Starting to evaluate.')
        eval_results = dontbeturtle_estimator.evaluate(
            input_fn=dataset_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                      (FLAGS.train_steps, elapsed_time))
def main(unused_argv):
  params = params_dict.ParamsDict(
      resnet_config.RESNET_CFG, resnet_config.RESNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)

  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  params.validate()
  params.lock()

  # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #     FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
  #     zone=FLAGS.tpu_zone,
  #     project=FLAGS.gcp_project)
  tpu_address = ''
  if 'COLAB_TPU_ADDR' not in os.environ:
    print('ERROR: Not connected to a TPU runtime; please see the first cell '
          'in this notebook for instructions!')
  else:
    tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']

    with tf.Session(tpu_address) as sess:
      with open('/content/adc.json', 'r') as f:
        auth_info = json.load(f)
      tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(5000, params.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      # cluster=tpu_cluster_resolver,
      master=tpu_address,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          num_shards=params.num_cores,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  warm_start_settings = None
  if FLAGS.warm_start_from:
    warm_start_settings = tf.estimator.WarmStartSettings(
        FLAGS.warm_start_from, vars_to_warm_start='^(?!.*dense)')

  resnet_classifier = tf.contrib.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      warm_start_from=warm_start_settings,
      model_fn=resnet_model_fn,
      config=config,
      params=params.as_dict(),
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu)

  assert (params.precision == 'bfloat16' or
          params.precision == 'float32'), (
              'Invalid value for precision parameter; '
              'must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params.precision)
  use_bfloat16 = params.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=params.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.image_size,
            num_parallel_calls=params.num_parallel_calls,
            include_background_label=(params.num_label_classes == 1001),
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  steps_per_epoch = params.num_train_images // params.train_batch_size
  eval_steps = params.num_eval_images // params.eval_batch_size

  if FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    steps_per_epoch = params.num_train_images // params.train_batch_size
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', params.train_steps,
        params.train_steps / steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(5000, params.iterations_per_loop)))
      if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir,
                tpu=FLAGS.tpu))
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)
    elif FLAGS.mode == 'train_and_eval':
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=int(next_checkpoint))
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      export_path = resnet_classifier.export_saved_model(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
      if FLAGS.add_warmup_requests:
        inference_warmup.write_warmup_requests(
            export_path,
            FLAGS.model_name,
            params.image_size,
            batch_sizes=FLAGS.inference_batch_sizes,
            image_format='JPEG')
def main(unused_argv): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) config = tpu_config.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, save_checkpoints_steps=max(600, FLAGS.iterations_per_loop), tpu_config=tpu_config.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_cores, per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2)) # pylint: disable=line-too-long resnet_classifier = tpu_estimator.TPUEstimator( export_to_tpu=False, use_tpu=FLAGS.use_tpu, model_fn=resnet_model_fn, config=config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', ( 'Invalid value for --precision flag; must be bfloat16 or float32.') tf.logging.info('Precision: %s', FLAGS.precision) use_bfloat16 = FLAGS.precision == 'bfloat16' # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput(is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=FLAGS.transpose_input, use_bfloat16=use_bfloat16) for is_training in [True, False] ] if FLAGS.mode == 'eval': eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time( ) # This time will include compilation time eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d' % (eval_results, elapsed_time)) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= FLAGS.train_steps: tf.logging.info( 'Evaluation finished after training step %d' % current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint' % ckpt) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( FLAGS.model_dir) # pylint: disable=protected-access,line-too-long batches_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size tf.logging.info('Training for %d steps (%.2f epochs in total). Current' ' step %d.' % (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch, current_step)) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.mode == 'train': resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps) else: assert FLAGS.mode == 'train_and_eval' while current_step < FLAGS.train_steps: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. 
next_checkpoint = min(current_step + FLAGS.steps_per_eval, FLAGS.train_steps) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be consistently excluded modulo the batch size. tf.logging.info('Starting to evaluate.') eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=FLAGS.num_eval_images // FLAGS.eval_batch_size) tf.logging.info('Eval results: %s' % eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.' % (FLAGS.train_steps, elapsed_time)) if FLAGS.export_dir is not None: # The guide to serve a exported TensorFlow model is at: # https://www.tensorflow.org/serving/serving_basic tf.logging.info('Starting to export model.') resnet_classifier.export_savedmodel( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=imagenet_input.image_serving_input_fn )
def main(_): tf.logging.set_verbosity(tf.logging.INFO) # RevNet specific configuration revnet_config = { "revnet-56": config_.get_hparams_imagenet_56(), "revnet-104": config_.get_hparams_imagenet_104() }[FLAGS.revnet_config] if FLAGS.use_tpu: revnet_config.data_format = "channels_last" tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # Estimator specific configuration config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=True), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_shards, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig. PER_HOST_V2), ) # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=FLAGS.transpose_input, use_bfloat16=False) for is_training in [True, False] ] revnet_classifier = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=revnet_config.tpu_batch_size, eval_batch_size=revnet_config.tpu_eval_batch_size, config=config, export_to_tpu=False, params={"revnet_config": revnet_config}) steps_per_epoch = revnet_config.tpu_iters_per_epoch eval_steps = revnet_config.tpu_eval_steps # pylint: disable=protected-access if FLAGS.mode == "eval": # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info("Starting to evaluate.") try: start_timestamp = time.time() # This time will include compilation time eval_results = revnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info("Eval results: %s. Elapsed seconds: %d" % (eval_results, elapsed_time)) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split("-")[1]) if current_step >= revnet_config.max_train_iter: tf.logging.info( "Evaluation finished after training step %d" % current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( "Checkpoint %s no longer exists, skipping checkpoint" % ckpt) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( FLAGS.model_dir) tf.logging.info( "Training for %d steps (%.2f epochs in total). Current" " step %d." % (revnet_config.max_train_iter, revnet_config.max_train_iter / steps_per_epoch, current_step)) start_timestamp = time.time() # This time will include compilation time if FLAGS.mode == "train": revnet_classifier.train( input_fn=imagenet_train.input_fn, max_steps=revnet_config.max_train_iter) else: assert FLAGS.mode == "train_and_eval" while current_step < revnet_config.max_train_iter: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. 
next_checkpoint = min(current_step + FLAGS.steps_per_eval, revnet_config.max_train_iter) revnet_classifier.train( input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % (next_checkpoint, int(time.time() - start_timestamp))) # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be excluded modulo the batch size. As long as the batch size is # consistent, the evaluated images are also consistent. tf.logging.info("Starting to evaluate.") eval_results = revnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps) tf.logging.info("Eval results: %s" % eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % (revnet_config.max_train_iter, elapsed_time)) if FLAGS.export_dir is not None: # The guide to serve an exported TensorFlow model is at: # https://www.tensorflow.org/serving/serving_basic tf.logging.info("Starting to export model.") revnet_classifier.export_savedmodel( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def main(_): config = process_config(FLAGS.config_path) print(config) tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): deploy_config = deploy.DeploymentConfig(num_clones=1) global_step = tf.Variable(0, trainable=False, name='global_step') # select model and build net net = tdr2n2.Unet(config) # create batch dataset with tf.device(deploy_config.inputs_device()): data = DataGenerator(config.input) x_test, y_test = data.get_eval_data() x_test = tf.expand_dims(x_test, -1) x_test.set_shape([ None, config.input.img_out_shape[0], config.input.img_out_shape[1], config.input.img_out_shape[2] ]) y_test.set_shape([ None, config.input.mask_out_shape[0], config.input.mask_out_shape[1] ]) y_test = tf.cast(y_test, tf.int32) y_test_hot = tf.one_hot(y_test, depth=config.network.num_classes, axis=-1) f_score, end_points = net.net(x_test) f_score_img = tf.expand_dims( tf.cast(tf.argmax(f_score, axis=-1), tf.float32) * 50., -1) y_test_img = tf.expand_dims( tf.cast(tf.argmax(y_test_hot, axis=-1), tf.float32) * 50., -1) ## add precision and recall f_score = tf.cast(tf.argmax(f_score, -1), tf.int32) #f_score = tf.image.resize_bilinear(f_score, (config.input.img_out_shape[0])) f_score = tf.one_hot(f_score, depth=config.network.num_classes, axis=-1) pred = tf.reduce_sum(f_score * y_test_hot, axis=(0, 1, 2)) all_pred = tf.reduce_sum(f_score, axis=(0, 1, 2)) + 1e-5 all_true = tf.reduce_sum(y_test_hot, axis=(0, 1, 2)) + 1e-5 # Variables to restore: moving avg. or normal weights. if config.train.moving_average_decay: variable_averages = tf.train.ExponentialMovingAverage( config.train.moving_average_decay, global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[global_step.op.name] = global_step else: variables_to_restore = slim.get_variables_to_restore() saver = None if variables_to_restore is not None: saver = tf_saver.Saver(variables_to_restore) # =================================================================== # # Evaluation loop. 
# =================================================================== # gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=config.deploy.gpu_memory_fraction) configproto = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True, ) merged = tf.summary.merge_all() sum_writer = tf.summary.FileWriter(logdir=config.summary.test_dir) for checkpoint_path in evaluation.checkpoints_iterator( config.finetune.eval_checkpoint_dir): with tf.Session(config=configproto) as session: session.run(tf.global_variables_initializer()) session.run(data.get_iterator(is_train=False).initializer) saver.restore(session, checkpoint_path) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) k = 1 tp = [] tp_fp = [] tp_fn = [] imgs = [] while True: try: pred_, all_pred_, all_true_, pred_img, true_img, g_step = session.run( [ pred, all_pred, all_true, f_score_img, y_test_img, global_step ]) tp.append(np.expand_dims(pred_, 0)) tp_fp.append(np.expand_dims(all_pred_, 0)) tp_fn.append(np.expand_dims(all_true_, 0)) #img = util.merge_pics(pred_img, true_img) print("Processed {} images".format( k * config.input.batch_size)) k += 1 except tf.errors.OutOfRangeError: tp_ = np.sum(np.concatenate(tp, 0), 0) tp_fn_ = np.sum(np.concatenate(tp_fn, 0), 0) tp_fp_ = np.sum(np.concatenate(tp_fp, 0), 0) precision = tp_ / tp_fp_ recall = tp_ / tp_fn_ dice = 2 * tp_ / (tp_fp_ + tp_fn_) print(precision) print(recall) print(dice) summary = tf.Summary() for i in range(recall.shape[0]): summary.value.add( tag='evaluation/{}th_class_precision'.format( i), simple_value=precision[i]) summary.value.add( tag='evaluation/{}th_class_recall'.format(i), simple_value=recall[i]) summary.value.add( tag='evaluation/{}th_class_dice'.format(i), simple_value=dice[i]) sum_writer.add_summary(summary, g_step) break logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
def main(argv): FLAGS = argv[0] # pylint:disable=invalid-name,redefined-outer-name tf.logging.set_verbosity(tf.logging.INFO) # RevNet specific configuration config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset) if FLAGS.use_tpu: tf.logging.info("Using TPU.") tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) else: tpu_cluster_resolver = None # TPU specific configuration tpu_config = tf.contrib.tpu.TPUConfig( # Recommended to be set as number of global steps for next checkpoint iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_shards) # Estimator specific configuration run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config, ) # Construct TPU Estimator estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, use_tpu=FLAGS.use_tpu, train_batch_size=config.tpu_batch_size, eval_batch_size=config.tpu_eval_batch_size, config=run_config, params={ "FLAGS": FLAGS, "config": config, }) # Construct input functions train_input_fn = get_input_fn( config=config, data_dir=FLAGS.data_dir, split="train_all") eval_input_fn = get_input_fn( config=config, data_dir=FLAGS.data_dir, split="test") # Disabling a range within an else block currently doesn't work # due to https://github.com/PyCQA/pylint/issues/872 # pylint: disable=protected-access if FLAGS.mode == "eval": # TPUEstimator.evaluate *requires* a steps argument. # Note that the number of examples used during evaluation is # --eval_steps * --batch_size. # So if you change --batch_size then change --eval_steps too. eval_steps = 10000 // config.tpu_eval_batch_size # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info("Starting to evaluate.") try: start_timestamp = time.time() # This time will include compilation time eval_results = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info("Eval results: %s. Elapsed seconds: %d" % (eval_results, elapsed_time)) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split("-")[1]) if current_step >= config.max_train_iter: tf.logging.info( "Evaluation finished after training step %d" % current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( "Checkpoint %s no longer exists, skipping checkpoint" % ckpt) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator_._load_global_step_from_checkpoint_dir( FLAGS.model_dir) tf.logging.info("Training for %d steps . Current" " step %d." % (config.max_train_iter, current_step)) start_timestamp = time.time() # This time will include compilation time if FLAGS.mode == "train": estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter) else: eval_steps = 10000 // config.tpu_eval_batch_size assert FLAGS.mode == "train_and_eval" while current_step < config.max_train_iter: # Train for up to steps_per_eval number of steps. 
# At the end of training, a checkpoint will be written to --model_dir. next_checkpoint = min(current_step + FLAGS.steps_per_eval, config.max_train_iter) estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint) current_step = next_checkpoint # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be consistently excluded modulo the batch size. tf.logging.info("Starting to evaluate.") eval_results = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps) tf.logging.info("Eval results: %s" % eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % (config.max_train_iter, elapsed_time))
def main(unused_argv): params = resnet_params.from_file(FLAGS.param_file) params = resnet_params.override(params, FLAGS.param_overrides) resnet_params.log_hparams_to_model_dir(params, FLAGS.model_dir) tf.logging.info('Model params: {}'.format(params)) tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '', zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) if params['use_async_checkpointing']: save_checkpoints_steps = None else: save_checkpoints_steps = max(100, params['iterations_per_loop']) config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, save_checkpoints_steps=save_checkpoints_steps, log_step_count_steps=FLAGS.log_step_count_steps, session_config=tf.ConfigProto( graph_options=tf.GraphOptions( rewrite_options=rewriter_config_pb2.RewriterConfig( disable_meta_optimizer=True))), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=params['iterations_per_loop'], num_shards=params['num_cores'], per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig .PER_HOST_V2)) # pylint: disable=line-too-long if FLAGS.inference_with_all_cores: resnet_classifier = tf.contrib.tpu.TPUEstimator( use_tpu=params['use_tpu'], model_fn=resnet_model_fn, config=config, params=params, train_batch_size=params['train_batch_size'], eval_batch_size=params['eval_batch_size'], export_to_tpu=FLAGS.export_to_tpu, experimental_exported_model_uses_all_cores=FLAGS. inference_with_all_cores) else: resnet_classifier = tf.contrib.tpu.TPUEstimator( use_tpu=params['use_tpu'], model_fn=resnet_model_fn, config=config, params=params, train_batch_size=params['train_batch_size'], eval_batch_size=params['eval_batch_size'], export_to_tpu=FLAGS.export_to_tpu) assert (params['precision'] == 'bfloat16' or params['precision'] == 'float32'), ('Invalid value for precision parameter; ' 'must be bfloat16 or float32.') tf.logging.info('Precision: %s', params['precision']) use_bfloat16 = params['precision'] == 'bfloat16' # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. 
if FLAGS.bigtable_instance: tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table) select_train, select_eval = _select_tables_from_flags() imagenet_train, imagenet_eval = [ imagenet_input.ImageNetBigtableInput( is_training=is_training, use_bfloat16=use_bfloat16, transpose_input=params['transpose_input'], selection=selection) for (is_training, selection) in [(True, select_train), (False, select_eval)] ] else: if FLAGS.data_dir == FAKE_DATA_DIR: tf.logging.info('Using fake dataset.') else: tf.logging.info('Using dataset: %s', FLAGS.data_dir) imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=params['transpose_input'], cache=params['use_cache'] and is_training, image_size=params['image_size'], num_parallel_calls=params['num_parallel_calls'], use_bfloat16=use_bfloat16) for is_training in [True, False] ] steps_per_epoch = params['num_train_images'] // params['train_batch_size'] eval_steps = params['num_eval_images'] // params['eval_batch_size'] if FLAGS.mode == 'eval': # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time( ) # This time will include compilation time eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= params['train_steps']: tf.logging.info( 'Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', ckpt) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( FLAGS.model_dir) # pylint: disable=protected-access,line-too-long steps_per_epoch = params['num_train_images'] // params[ 'train_batch_size'] tf.logging.info( 'Training for %d steps (%.2f epochs in total). Current' ' step %d.', params['train_steps'], params['train_steps'] / steps_per_epoch, current_step) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.mode == 'train': hooks = [] if params['use_async_checkpointing']: hooks.append( async_checkpoint.AsyncCheckpointSaverHook( checkpoint_dir=FLAGS.model_dir, save_steps=max(100, params['iterations_per_loop']))) if FLAGS.profile_every_n_steps > 0: hooks.append( tpu_profiler_hook.TPUProfilerHook( save_steps=FLAGS.profile_every_n_steps, output_dir=FLAGS.model_dir, tpu=FLAGS.tpu)) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=params['train_steps'], hooks=hooks) else: assert FLAGS.mode == 'train_and_eval' while current_step < params['train_steps']: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. 
next_checkpoint = min(current_step + FLAGS.steps_per_eval, params['train_steps']) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', next_checkpoint, int(time.time() - start_timestamp)) # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be excluded modulo the batch size. As long as the batch size is # consistent, the evaluated images are also consistent. tf.logging.info('Starting to evaluate.') eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=params['num_eval_images'] // params['eval_batch_size']) tf.logging.info('Eval results at step %d: %s', next_checkpoint, eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', params['train_steps'], elapsed_time) if FLAGS.export_dir is not None: # The guide to serve a exported TensorFlow model is at: # https://www.tensorflow.org/serving/serving_basic tf.logging.info('Starting to export model.') export_path = resnet_classifier.export_saved_model( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=imagenet_input.image_serving_input_fn ) if FLAGS.add_warmup_requests: inference_warmup.write_warmup_requests( export_path, FLAGS.model_name, params['image_size'], batch_sizes=FLAGS.inference_batch_sizes, image_format='JPEG')
def testMonitorCheckpointsLoopTimeout(self): ret = list( evaluation_lib.checkpoints_iterator('/non-existent-dir', timeout=0)) self.assertEqual(ret, [])
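The mains above all wrap evaluation.checkpoints_iterator in the same skeleton: wait for a new checkpoint, evaluate it, recover the global step from the checkpoint basename, stop once the final training step has been evaluated, and tolerate checkpoints that the trainer has already garbage-collected. The sketch below merely distills that shared pattern; it is not taken from any of these repositories, the helper name and arguments are illustrative, and it assumes a TF 1.x Estimator setup in which tf.train.checkpoints_iterator offers the same interface as the contrib evaluation.checkpoints_iterator used here.

import os

import tensorflow as tf


def continuous_eval(classifier, eval_input_fn, model_dir, eval_steps,
                    max_train_steps, eval_timeout=None):
  """Evaluates every new checkpoint in model_dir until training finishes."""
  for ckpt in tf.train.checkpoints_iterator(model_dir, timeout=eval_timeout):
    tf.logging.info('Starting to evaluate checkpoint %s.', ckpt)
    try:
      eval_results = classifier.evaluate(
          input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
      tf.logging.info('Eval results: %s', eval_results)
      # Checkpoint basenames look like 'model.ckpt-12345'; the number after
      # the dash is the global step at which the checkpoint was written.
      current_step = int(os.path.basename(ckpt).split('-')[1])
      if current_step >= max_train_steps:
        tf.logging.info('Evaluation finished after training step %d',
                        current_step)
        break
    except tf.errors.NotFoundError:
      # The trainer may delete a checkpoint before the eval job reaches it;
      # skip it and wait for the next one.
      tf.logging.info('Checkpoint %s no longer exists, skipping.', ckpt)

Passing eval_timeout=None makes the iterator wait indefinitely for new checkpoints, while a finite timeout (optionally combined with a timeout_fn, as the DenseNet and object-detection examples below do) lets the eval job exit once the trainer has stopped producing checkpoints.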
def main(unused_argv): del unused_argv # Unused tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', ( 'Invalid value for --precision flag; must be bfloat16 or float32.') tf.logging.info('Precision: %s', FLAGS.precision) batch_size_per_shard = FLAGS.train_batch_size // FLAGS.num_shards params = { 'model_transpose_dims': [0, 1, 2, 3], 'pipeline_transpose_dims': [0, 1, 2, 3], } batch_axis = 0 if FLAGS.transpose_enabled: # On the TPU, convolutions are executed with a different leading # dimension when batch size per shard is less than 64. By # default images are loaded in NHWC order. For optimal performance, # we want to use CHWN order while training when the batch size # per shard is smaller than 64. if batch_size_per_shard >= 64: params['model_transpose_dims'] = [3, 0, 1, 2] params['pipeline_transpose_dims'] = [1, 2, 3, 0] batch_axis = 3 else: params['model_transpose_dims'] = [2, 0, 1, 3] params['pipeline_transpose_dims'] = [1, 2, 0, 3] batch_axis = 2 if FLAGS.eval_total_size > 0: eval_size = FLAGS.eval_total_size else: eval_size = _NUM_EVAL_IMAGES eval_steps = eval_size // FLAGS.eval_batch_size iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations) eval_batch_size = (None if FLAGS.mode == 'train' else FLAGS.eval_batch_size) per_host_input_for_training = (FLAGS.num_shards <= 8 if FLAGS.mode == 'train' else True) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, save_checkpoints_secs=FLAGS.save_checkpoints_secs, save_summary_steps=FLAGS.save_summary_steps, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=iterations, num_shards=FLAGS.num_shards, per_host_input_for_training=per_host_input_for_training)) inception_classifier = tf.contrib.tpu.TPUEstimator( model_fn=inception_model_fn, use_tpu=FLAGS.use_tpu, config=run_config, params=params, train_batch_size=FLAGS.train_batch_size, eval_batch_size=eval_batch_size, batch_axis=(batch_axis, 0)) # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. use_bfloat16 = FLAGS.precision == 'bfloat16' imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir, use_bfloat16=use_bfloat16) imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir, use_bfloat16=use_bfloat16) if FLAGS.moving_average: eval_hooks = [LoadEMAHook(FLAGS.model_dir)] else: eval_hooks = [] if FLAGS.mode == 'eval': # Run evaluation when there is a new checkpoint for checkpoint in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time() # Includes compilation time eval_results = inception_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks, checkpoint_path=checkpoint) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. 
Elapsed seconds: %d', eval_results, elapsed_time) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(checkpoint).split('-')[1]) if current_step >= FLAGS.train_steps: tf.logging.info( 'Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', checkpoint) elif FLAGS.mode == 'train_and_eval': for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval): tf.logging.info('Starting training cycle %d.' % cycle) inception_classifier.train(input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps_per_eval) tf.logging.info('Starting evaluation cycle %d .' % cycle) eval_results = inception_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks) tf.logging.info('Evaluation results: %s' % eval_results) else: tf.logging.info('Starting training ...') inception_classifier.train(input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps) if FLAGS.export_dir is not None: tf.logging.info('Starting to export model.') inception_classifier.export_saved_model( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=image_serving_input_fn)
def main(argv): del argv # Unused. tf.enable_resource_variables() tf.set_random_seed(FLAGS.seed) set_lr_schedule() set_custom_sparsity_map() folder_stub = os.path.join(FLAGS.training_method, str(FLAGS.end_sparsity), str(FLAGS.maskupdate_begin_step), str(FLAGS.maskupdate_end_step), str(FLAGS.maskupdate_frequency), str(FLAGS.drop_fraction), str(FLAGS.label_smoothing), str(FLAGS.weight_decay)) output_dir = FLAGS.output_dir if FLAGS.use_folder_stub: output_dir = os.path.join(output_dir, folder_stub) export_dir = os.path.join(output_dir, 'export_dir') # we pass the updated eval and train string to the params dictionary. params = {} params['output_dir'] = output_dir params['training_method'] = FLAGS.training_method params['use_tpu'] = FLAGS.use_tpu dataset_func = functools.partial( imagenet_input.ImageNetInput, data_dir=FLAGS.data_directory, transpose_input=False, num_parallel_calls=FLAGS.num_parallel_calls, use_bfloat16=False) imagenet_train, imagenet_eval = [ dataset_func(is_training=is_training) for is_training in [True, False] ] run_config = tpu_config.RunConfig( master=FLAGS.master, model_dir=output_dir, save_checkpoints_steps=FLAGS.steps_per_checkpoint, keep_checkpoint_max=FLAGS.keep_checkpoint_max, session_config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_cores, tpu_job_name=FLAGS.tpu_job_name)) classifier = tpu_estimator.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=resnet_model_fn_w_pruning, params=params, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size) cpu_classifier = tpu_estimator.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=resnet_model_fn_w_pruning, params=params, config=run_config, train_batch_size=FLAGS.train_batch_size, export_to_tpu=False, eval_batch_size=FLAGS.eval_batch_size) if FLAGS.num_eval_images % FLAGS.eval_batch_size != 0: raise ValueError( 'eval_batch_size (%d) must evenly divide num_eval_images(%d)!' 
% (FLAGS.eval_batch_size, FLAGS.num_eval_images)) eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size if FLAGS.mode == 'eval_once': ckpt_path = os.path.join(output_dir, FLAGS.eval_once_ckpt_prefix) dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval classifier.evaluate(input_fn=dataset.input_fn, steps=eval_steps, checkpoint_path=ckpt_path, name='{0}'.format(FLAGS.eval_once_ckpt_prefix)) elif FLAGS.mode == 'eval': # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator(output_dir): tf.logging.info('Starting to evaluate.') try: dataset = imagenet_train if FLAGS.eval_on_train else imagenet_eval classifier.evaluate(input_fn=dataset.input_fn, steps=eval_steps, checkpoint_path=ckpt, name='eval') # Terminate eval job when final checkpoint is reached global_step = int(os.path.basename(ckpt).split('-')[1]) if global_step >= FLAGS.train_steps: tf.logging.info( 'Evaluation finished after training step %d' % global_step) break except tf.errors.NotFoundError: tf.logging.info('Checkpoint no longer exists, skipping checkpoint.') else: global_step = estimator._load_global_step_from_checkpoint_dir( output_dir) # Session run hooks to export model for prediction export_hook = ExportModelHook(cpu_classifier, export_dir) hooks = [export_hook] if FLAGS.mode == 'train': tf.logging.info('Starting training...') classifier.train(input_fn=imagenet_train.input_fn, hooks=hooks, max_steps=FLAGS.train_steps) else: assert FLAGS.mode == 'train_and_eval' tf.logging.info('Starting training and eval...') while global_step < FLAGS.train_steps: next_checkpoint = min(global_step + FLAGS.steps_per_eval, FLAGS.train_steps) classifier.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) global_step = next_checkpoint tf.logging.info('Completed training up to step %d', global_step) classifier.evaluate(input_fn=imagenet_eval.input_fn, steps=eval_steps)
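Most of these loops recover the step number with int(os.path.basename(ckpt).split('-')[1]), which assumes the standard 'model.ckpt-<step>' naming and raises on anything else. A slightly more defensive variant of that one-liner is sketched below; it is an illustrative helper, not part of the snippet above or any of the other repositories.

import os
import re


def global_step_from_checkpoint(ckpt_path):
  """Returns the global step encoded in a checkpoint path, or None."""
  # Assumes the usual 'model.ckpt-<step>' naming produced by tf.train.Saver;
  # illustrative sketch only.
  match = re.search(r'-(\d+)$', os.path.basename(ckpt_path))
  return int(match.group(1)) if match else None

For example, global_step_from_checkpoint('/tmp/model_dir/model.ckpt-25000') returns 25000, while a path without a step suffix yields None instead of raising a ValueError or IndexError.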
def main(unused_argv): tf.flags.mark_flag_as_required('model_dir') tf.flags.mark_flag_as_required('pipeline_config_path') if FLAGS.master is None and FLAGS.tpu_name is None: raise RuntimeError('You must specify either --master or --tpu_name.') if FLAGS.master is not None: if FLAGS.tpu_name is not None: tf.logging.warn('Both --master and --tpu_name are set. Ignoring ' '--tpu_name and using --master.') tpu_grpc_url = FLAGS.master else: tpu_cluster_resolver = ( tf.contrib.cluster_resolver.python.training.TPUClusterResolver( tpu_names=[FLAGS.tpu_name], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)) tpu_grpc_url = tpu_cluster_resolver.get_master() config = tpu_config.RunConfig( master=tpu_grpc_url, evaluation_master=tpu_grpc_url, model_dir=FLAGS.model_dir, tpu_config=tpu_config.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_shards)) params = {} (estimator, train_input_fn, eval_validation_input_fn, eval_training_input_fn, train_steps, eval_steps) = ( create_estimator( config, model_hparams.create_hparams( hparams_overrides=FLAGS.hparams_overrides), FLAGS.pipeline_config_path, train_steps=FLAGS.num_train_steps, eval_steps=FLAGS.num_eval_steps, train_batch_size=FLAGS.train_batch_size, use_tpu=FLAGS.use_tpu, num_shards=FLAGS.num_shards, params=params)) if FLAGS.mode in ['train', 'train_and_eval']: estimator.train(input_fn=train_input_fn, max_steps=train_steps) if FLAGS.mode == 'train_and_eval': # Eval one time. eval_results = estimator.evaluate( input_fn=eval_validation_input_fn, steps=eval_steps) tf.logging.info('Eval results: %s' % eval_results) # Continuously evaluating. if FLAGS.mode == 'eval': def terminate_eval(): tf.logging.info('Terminating eval after %d seconds of no checkpoints' % FLAGS.eval_timeout_secs) return True # Run evaluation when there's a new checkpoint. for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval_secs, timeout=FLAGS.eval_timeout_secs, timeout_fn=terminate_eval): tf.logging.info('Starting to evaluate.') if FLAGS.eval_training_data: name = 'training_data' input_fn = eval_training_input_fn else: name = 'validation_data' input_fn = eval_validation_input_fn try: eval_results = estimator.evaluate( input_fn=input_fn, steps=eval_steps, checkpoint_path=ckpt, name=name) tf.logging.info('Eval results: %s' % eval_results) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= train_steps: tf.logging.info( 'Evaluation finished after training step %d' % current_step) break except tf.errors.NotFoundError: tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
def main(unused_argv): if FLAGS.use_tpu: if FLAGS.master is None and FLAGS.tpu_name is None: raise RuntimeError( "You must specify either --master or --tpu_name.") if FLAGS.master is not None: if FLAGS.tpu_name is not None: tf.logging.warn( "Both --master and --tpu_name are set. Ignoring " "--tpu_name and using --master.") tpu_grpc_url = FLAGS.master else: tpu_cluster_resolver = ( tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)) tpu_grpc_url = tpu_cluster_resolver.get_master() else: # URL is unused if running locally without TPU tpu_grpc_url = None batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size steps_per_checkpoint = FLAGS.steps_per_checkpoint iterations_per_loop = FLAGS.iterations_per_loop eval_steps = _NUM_EVAL_IMAGES // FLAGS.eval_batch_size if iterations_per_loop is None or steps_per_checkpoint < iterations_per_loop: iterations_per_loop = steps_per_checkpoint if FLAGS.mode == "eval": iterations_per_loop = eval_steps params = { "batches_per_epoch": batches_per_epoch, } config = tpu_config.RunConfig(master=tpu_grpc_url, evaluation_master=tpu_grpc_url, model_dir=FLAGS.model_dir, save_checkpoints_steps=steps_per_checkpoint, log_step_count_steps=iterations_per_loop, tpu_config=tpu_config.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_shards)) densenet_estimator = tpu_estimator.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, params=params) if FLAGS.mode == "train": tf.logging.info( "Training for %d steps (%.2f epochs in total)." % (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch)) densenet_estimator.train(input_fn=ImageNetInput(True), max_steps=FLAGS.train_steps) elif FLAGS.mode == "train_and_eval": current_step = 0 tf.logging.info( "Training for %d steps (%.2f epochs in total). Current " "step %d" % (FLAGS.train_steps, FLAGS.train_steps / batches_per_epoch, current_step)) while current_step < FLAGS.train_steps: next_checkpoint = min(current_step + steps_per_checkpoint, FLAGS.train_steps) num_steps = next_checkpoint - current_step current_step = next_checkpoint densenet_estimator.train(input_fn=ImageNetInput(True), steps=num_steps) tf.logging.info("Starting to evaluate.") eval_results = densenet_estimator.evaluate( input_fn=ImageNetInput(False), steps=_NUM_EVAL_IMAGES // FLAGS.eval_batch_size) tf.logging.info("Eval results: %s" % eval_results) else: def terminate_eval(): tf.logging.info( "Terminating eval after %d seconds of no checkpoints" % FLAGS.eval_timeout) return True # Run evaluation when there's a new checkpoint # If the evaluation worker is delayed in processing a new checkpoint, # the checkpoint file may be deleted by the trainer before it can be # evaluated. # Ignore the error in this case. for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, min_interval_secs=FLAGS.min_eval_interval, timeout=FLAGS.eval_timeout, timeout_fn=terminate_eval): tf.logging.info("Starting to evaluate.") try: eval_results = densenet_estimator.evaluate( input_fn=ImageNetInput(False), steps=eval_steps, checkpoint_path=ckpt) tf.logging.info("Eval results: %s" % eval_results) except tf.errors.NotFoundError: tf.logging.info( "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)
def main(unused_argv): params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file, FLAGS.hparams_file, FLAGS, FLAGS.hparams) tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '', zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) if params['use_async_checkpointing']: save_checkpoints_steps = None else: save_checkpoints_steps = max(2500, params['iterations_per_loop']) config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=get_model_dir(params), save_checkpoints_steps=save_checkpoints_steps, keep_checkpoint_max=None, # Keep all checkpoints. log_step_count_steps=FLAGS.log_step_count_steps, session_config=tf.ConfigProto( graph_options=tf.GraphOptions( rewrite_options=rewriter_config_pb2.RewriterConfig( disable_meta_optimizer=True))), tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=params['iterations_per_loop'], num_shards=params['num_cores'], # copybara:strip_begin tpu_job_name=FLAGS.tpu_job_name, # copybara:strip_end per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig .PER_HOST_V2)) # pylint: disable=line-too-long resnet_classifier = tf.contrib.tpu.TPUEstimator( use_tpu=params['use_tpu'], model_fn=resnet_model_fn, config=config, params=params, train_batch_size=params['train_batch_size'], eval_batch_size=params['eval_batch_size'], export_to_tpu=FLAGS.export_to_tpu) # copybara:strip_begin if FLAGS.xla_compile: resnet_classifier = tf.contrib.tpu.TPUEstimator( use_tpu=params['use_tpu'], model_fn=xla.estimator_model_fn(resnet_model_fn), config=config, params=params, train_batch_size=params['train_batch_size'], eval_batch_size=params['eval_batch_size'], export_to_tpu=FLAGS.export_to_tpu) # copybara:strip_end assert (params['precision'] == 'bfloat16' or params['precision'] == 'float32'), ('Invalid value for precision parameter; ' 'must be bfloat16 or float32.') tf.logging.info('Precision: %s', params['precision']) use_bfloat16 = params['precision'] == 'bfloat16' # Input pipelines are slightly different (with regards to shuffling and # preprocessing) between training and evaluation. 
if FLAGS.bigtable_instance: tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table) select_train, select_eval = _select_tables_from_flags() imagenet_train = imagenet_input.ImageNetBigtableInput( is_training=True, use_bfloat16=use_bfloat16, transpose_input=params['transpose_input'], selection=select_train) imagenet_eval = imagenet_input.ImageNetBigtableInput( is_training=False, use_bfloat16=use_bfloat16, transpose_input=params['transpose_input'], selection=select_eval) else: if FLAGS.data_dir == FAKE_DATA_DIR: tf.logging.info('Using fake dataset.') else: tf.logging.info('Using dataset: %s', FLAGS.data_dir) imagenet_train, imagenet_eval = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=params['transpose_input'], cache=params['use_cache'] and is_training, image_size=params['image_size'], num_parallel_calls=params['num_parallel_calls'], use_bfloat16=use_bfloat16) for is_training in [True, False] ] steps_per_epoch = params['num_train_images'] // params['train_batch_size'] eval_steps = params['num_eval_images'] // params['eval_batch_size'] if FLAGS.mode == 'eval': # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( get_model_dir(params), timeout=FLAGS.eval_timeout): tf.logging.info('Starting to evaluate.') try: start_timestamp = time.time( ) # This time will include compilation time eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=eval_steps, checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split('-')[1]) if current_step >= params['train_steps']: tf.logging.info( 'Evaluation finished after training step %d', current_step) break except tf.errors.NotFoundError: # Since the coordinator is on a different job than the TPU worker, # sometimes the TPU worker does not finish initializing until long after # the CPU job tells it to start evaluating. In this case, the checkpoint # file could have been deleted already. tf.logging.info( 'Checkpoint %s no longer exists, skipping checkpoint', ckpt) elif FLAGS.mode == 'eval_igt': # IGT evaluation mode. Evaluate metrics for the desired parameters # (true or shifted) on the desired dataset (train or eval). Note that # train is still with data augmentation. # Get checkpoint file names. index_files = tf.gfile.Glob( os.path.join(get_model_dir(params), 'model.ckpt-*.index')) checkpoints = [fn[:-len('.index')] for fn in index_files] # Need to sort them to get proper tensorboard plotting (increasing event # timestamps correspond to increasing steps). checkpoint_steps = [] for ckpt in checkpoints: tf.logging.info(ckpt) step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt) checkpoint_steps.append(int(step_match.group(1))) checkpoints = [ ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints)) ] tf.logging.info('There are {} checkpoints'.format(len(checkpoints))) tf.logging.info(', '.join(checkpoints)) # Keep track of the last processed checkpoint (fault tolerance). analysis_state_path = os.path.join( get_model_dir(params), 'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode) next_analysis_index = 0 if tf.gfile.Exists(analysis_state_path): with tf.gfile.Open(analysis_state_path) as fd: next_analysis_index = int(fd.read()) # Process each checkpoint. 
while next_analysis_index < len(checkpoints): tf.logging.info( 'Next analysis index: {}'.format(next_analysis_index)) ckpt_path = checkpoints[next_analysis_index] tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path)) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.igt_eval_set == 'train': the_input_fn = imagenet_train.input_fn the_steps = steps_per_epoch elif FLAGS.igt_eval_set == 'eval': the_input_fn = imagenet_eval.input_fn the_steps = eval_steps else: raise ValueError('Unsupported igt_eval_set') eval_results = resnet_classifier.evaluate( input_fn=the_input_fn, steps=the_steps, checkpoint_path=ckpt_path, name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) next_analysis_index += 1 file_io.atomic_write_string_to_file(analysis_state_path, str(next_analysis_index)) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( get_model_dir(params)) # pylint:disable=protected-access,g-line-too-long steps_per_epoch = params['num_train_images'] // params[ 'train_batch_size'] tf.logging.info( 'Training for %d steps (%.2f epochs in total). Current' ' step %d.', params['train_steps'], params['train_steps'] / steps_per_epoch, current_step) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.mode == 'train': hooks = [] if params['use_async_checkpointing']: hooks.append( async_checkpoint.AsyncCheckpointSaverHook( checkpoint_dir=get_model_dir(params), save_steps=max(2500, params['iterations_per_loop']))) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=params['train_steps'], hooks=hooks) else: assert FLAGS.mode == 'train_and_eval' while current_step < params['train_steps']: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. next_checkpoint = min(current_step + FLAGS.steps_per_eval, params['train_steps']) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', next_checkpoint, int(time.time() - start_timestamp)) # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be excluded modulo the batch size. As long as the batch size is # consistent, the evaluated images are also consistent. tf.logging.info('Starting to evaluate.') eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=params['num_eval_images'] // params['eval_batch_size']) tf.logging.info('Eval results at step %d: %s', next_checkpoint, eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', params['train_steps'], elapsed_time) if FLAGS.export_dir is not None: # The guide to serve a exported TensorFlow model is at: # https://www.tensorflow.org/serving/serving_basic tf.logging.info('Starting to export model.') unused_export_path = resnet_classifier.export_saved_model( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=imagenet_input.image_serving_input_fn )
def main(unused_argv):
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    if FLAGS.model_name.startswith('efficientnet-edgetpu'):
      _, _, input_image_size, _ = efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
          FLAGS.model_name)
    elif FLAGS.model_name.startswith('efficientnet-tpu'):
      _, _, input_image_size, _ = efficientnet_tpu_builder.efficientnet_tpu_params(
          FLAGS.model_name)
    elif FLAGS.model_name.startswith('efficientnet'):
      _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
          FLAGS.model_name)
    else:
      raise ValueError('input_image_size must be set except for EfficientNet')

  # For imagenet dataset, include background label if number of output classes
  # is 1001
  include_background_label = (FLAGS.num_label_classes == 1001)

  if FLAGS.tpu or FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long

  # Initializes model parameters.
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      use_bfloat16=FLAGS.use_bfloat16)
  est = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  def build_imagenet_input(is_training):
    """Generate ImageNetInput for training and eval."""
    if FLAGS.bigtable_instance:
      tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
      select_train, select_eval = _select_tables_from_flags()
      return imagenet_input.ImageNetBigtableInput(
          is_training=is_training,
          use_bfloat16=FLAGS.use_bfloat16,
          transpose_input=FLAGS.transpose_input,
          selection=select_train if is_training else select_eval,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)
    else:
      if FLAGS.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
      else:
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
      return imagenet_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          cache=FLAGS.use_cache and is_training,
          image_size=input_image_size,
          num_parallel_calls=FLAGS.num_parallel_calls,
          use_bfloat16=FLAGS.use_bfloat16,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)

  imagenet_train = build_imagenet_input(is_training=True)
  imagenet_eval = build_imagenet_input(is_training=False)

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        est.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

  if FLAGS.export_dir:
    export(est, FLAGS.export_dir, input_image_size)
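The eval-only branches above poll for new checkpoints with evaluation.checkpoints_iterator and break out of the loop once the final training step has been evaluated. The following is a minimal, self-contained sketch of that pattern, assuming a TF 1.x environment; estimator, eval_input_fn, eval_steps, model_dir, and final_step are placeholder names, and the timeout_fn hook is used to stop waiting once the newest checkpoint is already the last one.

# Standalone eval loop: evaluate every new checkpoint, stop after the final one.
import os
import tensorflow as tf


def run_eval_loop(estimator, eval_input_fn, eval_steps, model_dir, final_step,
                  timeout_secs=3600):
  """Evaluates each new checkpoint in model_dir until final_step is reached."""

  def timeout_fn():
    # Called when no new checkpoint appears within timeout_secs; returning
    # True ends the iterator instead of waiting forever.
    latest = tf.train.latest_checkpoint(model_dir)
    return latest is not None and int(latest.split('-')[-1]) >= final_step

  for ckpt in tf.train.checkpoints_iterator(
      model_dir, timeout=timeout_secs, timeout_fn=timeout_fn):
    try:
      results = estimator.evaluate(
          input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
      tf.logging.info('Eval results for %s: %s', ckpt, results)
      # Checkpoint prefixes look like '<model_dir>/model.ckpt-<global_step>'.
      step = int(os.path.basename(ckpt).split('-')[1])
      if step >= final_step:
        break
    except tf.errors.NotFoundError:
      # The checkpoint may have been garbage-collected before we could read it.
      tf.logging.info('Checkpoint %s vanished, skipping.', ckpt)

The break handles the common case where the final checkpoint is observed directly; the timeout_fn covers the case where the evaluator starts after training has already finished and no new checkpoint will ever appear.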
def main(unused_argv):
  # [START tpu-cluster-resolver]
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  config = tpu_config.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=max(600, FLAGS.iterations_per_loop),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          per_host_input_for_training=tpu_config.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long
  # [END tpu-cluster-resolver]

  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train, imagenet_eval = [imagenet_input.ImageNetInput(
      is_training=is_training,
      data_dir=FLAGS.data_dir,
      transpose_input=FLAGS.transpose_input,
      use_bfloat16=use_bfloat16) for is_training in [True, False]]

  if FLAGS.mode == 'eval':
    eval_steps = NUM_EVAL_IMAGES // FLAGS.eval_batch_size

    # Run evaluation when there's a new checkpoint
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d' %
                        (eval_results, elapsed_time))

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)

  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.' % (FLAGS.train_steps,
                                   FLAGS.train_steps / batches_per_epoch,
                                   current_step))

    start_timestamp = time.time()  # This time will include compilation time
    if FLAGS.mode == 'train':
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be consistently excluded modulo the batch size.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=NUM_EVAL_IMAGES // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                    (FLAGS.train_steps, elapsed_time))

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
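Both export paths above hand the estimator a serving_input_receiver_fn. Purely as an illustration of that plumbing (this is not the actual imagenet_input.image_serving_input_fn), a TF 1.x receiver that accepts raw JPEG bytes might look like the sketch below; image_size and make_image_serving_input_fn are assumed names.

# Hypothetical serving input fn: decode a batch of JPEG byte strings, resize
# to the model's input resolution, and expose the placeholder for serving.
import tensorflow as tf


def make_image_serving_input_fn(image_size):
  """Builds a serving_input_receiver_fn that decodes and resizes JPEG bytes."""

  def serving_input_fn():
    image_bytes = tf.placeholder(
        dtype=tf.string, shape=[None], name='image_bytes')

    def decode_and_resize(serialized):
      image = tf.image.decode_jpeg(serialized, channels=3)
      image = tf.image.convert_image_dtype(image, dtype=tf.float32)
      return tf.image.resize_images(image, [image_size, image_size])

    images = tf.map_fn(decode_and_resize, image_bytes, dtype=tf.float32)
    return tf.estimator.export.ServingInputReceiver(
        features=images, receiver_tensors={'image_bytes': image_bytes})

  return serving_input_fn

It would be wired in the same way as the exports above, e.g. export_savedmodel(export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=make_image_serving_input_fn(224)); any model-specific preprocessing (mean subtraction, cropping, transposition) would live inside decode_and_resize.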