def main(_):
  """Runs the job selected by `FLAGS.mode`, then optionally exports a model.

  Recognized modes are 'eval', 'train_and_eval', 'predict' and 'train'; any
  other value only logs 'Mode not found.'. Regardless of the mode, when
  `FLAGS.export_dir` is set a SavedModel is exported afterwards, and warmup
  requests are written if `FLAGS.add_warmup_requests` is set.

  Args:
    _: Unused positional argument.
  """
  mode = FLAGS.mode
  data_dir = FLAGS.data_dir
  model_dir = FLAGS.model_dir
  hparams = build_hparams()

  estimator_parmas = {}

  # Round up so that a partial final batch still counts as a training step.
  train_steps_per_epoch = int(
      math.ceil(hparams.num_train_images / float(hparams.train_batch_size)))
  eval_steps = hparams.num_eval_images // hparams.eval_batch_size
  # No eval/predict batch size is handed to the estimator in pure training.
  eval_batch_size = None if mode == 'train' else hparams.eval_batch_size

  model = model_lib.AmoebaNetEstimatorModel(hparams, model_dir)

  if hparams.use_tpu:
    run_config = build_run_config()
    # Temporary treatment until flags are released.
    image_classifier = contrib_tpu.TPUEstimator(
        model_fn=model.model_fn,
        use_tpu=True,
        config=run_config,
        params=estimator_parmas,
        predict_batch_size=eval_batch_size,
        train_batch_size=hparams.train_batch_size,
        eval_batch_size=eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu)
  else:
    # Fall back to the loop size when no explicit checkpoint interval is set.
    save_checkpoints_steps = (
        FLAGS.save_checkpoints_steps or FLAGS.iterations_per_loop)
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps)
    image_classifier = tf.estimator.Estimator(
        model_fn=model.model_fn,
        config=run_config,
        params=estimator_parmas)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = model_lib.InputPipeline(
      is_training=True, data_dir=data_dir, hparams=hparams)
  imagenet_eval = model_lib.InputPipeline(
      is_training=False, data_dir=data_dir, hparams=hparams)

  # The EMA hook is only installed when the decay is below 1.
  if hparams.moving_average_decay < 1:
    eval_hooks = [
        model_lib.LoadEMAHook(model_dir, hparams.moving_average_decay)
    ]
  else:
    eval_hooks = []

  if mode == 'eval':
    for checkpoint in _get_next_checkpoint():
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = image_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        tf.logging.info('Evaluation results: %s', eval_results)
      except tf.errors.NotFoundError:
        # Skip the checkpoint if it gets deleted prior to evaluation.
        tf.logging.info(
            'Checkpoint %s no longer exists ... skipping', checkpoint)
  elif mode == 'train_and_eval':
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    tf.logging.info('Starting training at step=%d.', current_step)
    train_steps_per_eval = int(
        hparams.num_epochs_per_eval * train_steps_per_epoch)
    # Left uncast on purpose: the comparisons below then behave exactly as
    # they would with a fractional epoch count.
    max_train_steps = hparams.num_epochs * train_steps_per_epoch
    # Final Evaluation if training is finished.
    if current_step >= max_train_steps:
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
    # Alternate between a block of training steps and one evaluation pass.
    while current_step < max_train_steps:
      image_classifier.train(
          input_fn=imagenet_train.input_fn, steps=train_steps_per_eval)
      current_step += train_steps_per_eval
      tf.logging.info('Starting evaluation at step=%d.', current_step)
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
  elif mode == 'predict':
    for checkpoint in _get_next_checkpoint():
      tf.logging.info('Starting prediction ...')
      time_hook = model_lib.SessionTimingHook()
      # Build a per-checkpoint hook list instead of appending to `eval_hooks`,
      # so timing hooks from earlier checkpoints do not pile up.
      result_iter = image_classifier.predict(
          input_fn=imagenet_eval.input_fn,
          hooks=eval_hooks + [time_hook],
          checkpoint_path=checkpoint,
          yield_single_examples=False)
      # Each yielded item is a whole batch (yield_single_examples=False), so
      # the image count is batches * batch size.
      results = list(itertools.islice(result_iter, eval_steps))
      tf.logging.info('Inference speed = {} images per second.'.format(
          time_hook.compute_speed(len(results) * eval_batch_size)))
  elif mode == 'train':
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    total_step = int(hparams.num_epochs * train_steps_per_epoch)
    # Only train for the steps that remain after the latest checkpoint.
    if current_step < total_step:
      tf.logging.info('Starting training ...')
      image_classifier.train(
          input_fn=imagenet_train.input_fn,
          steps=total_step - current_step)
  else:
    tf.logging.info('Mode not found.')

  if FLAGS.export_dir is not None:
    tf.logging.info('Starting exporting saved model ...')
    serving_shape = [hparams.image_size, hparams.image_size, 3]
    export_path = image_classifier.export_saved_model(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=build_image_serving_input_receiver_fn(
            serving_shape),
        as_text=True)
    if FLAGS.add_warmup_requests:
      inference_warmup.write_warmup_requests(
          export_path,
          FLAGS.model_name,
          hparams.image_size,
          batch_sizes=FLAGS.inference_batch_sizes)
def main(_):
  """Runs evaluation, interleaved train/eval, or training per `FLAGS.mode`.

  'eval' evaluates every checkpoint yielded by `_get_next_checkpoint()`,
  'train_and_eval' alternates training blocks with evaluation passes, and any
  other mode value trains for the steps remaining after the latest checkpoint.

  Args:
    _: Unused positional argument.
  """
  mode = FLAGS.mode
  data_dir = FLAGS.data_dir
  model_dir = FLAGS.model_dir
  hparams = build_hparams()

  estimator_parmas = {}

  # Round up so that a partial final batch still counts as a training step.
  train_steps_per_epoch = int(
      math.ceil(model_lib.NUM_TRAIN_IMAGES / float(hparams.train_batch_size)))
  eval_steps = model_lib.NUM_EVAL_IMAGES // hparams.eval_batch_size
  # No eval batch size is handed to the estimator in pure training.
  eval_batch_size = None if mode == 'train' else hparams.eval_batch_size

  run_config = build_run_config()
  model = model_lib.AmoebaNetEstimatorModel(hparams, model_dir)

  if hparams.use_tpu:
    image_classifier = tpu_estimator.TPUEstimator(
        model_fn=model.model_fn,
        use_tpu=True,
        config=run_config,
        params=estimator_parmas,
        train_batch_size=hparams.train_batch_size,
        eval_batch_size=eval_batch_size)
  else:
    image_classifier = tf.estimator.Estimator(
        model_fn=model.model_fn,
        config=run_config,
        params=estimator_parmas)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = model_lib.InputPipeline(
      is_training=True, data_dir=data_dir, hparams=hparams)
  imagenet_eval = model_lib.InputPipeline(
      is_training=False, data_dir=data_dir, hparams=hparams)

  # The EMA hook is only installed when the decay is below 1.
  if hparams.moving_average_decay < 1:
    eval_hooks = [
        model_lib.LoadEMAHook(model_dir, hparams.moving_average_decay)
    ]
  else:
    eval_hooks = []

  if mode == 'eval':
    for checkpoint in _get_next_checkpoint():
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = image_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        tf.logging.info('Evaluation results: %s', eval_results)
      except tf.errors.NotFoundError:
        # Skip the checkpoint if it gets deleted prior to evaluation.
        tf.logging.info(
            'Checkpoint %s no longer exists ... skipping', checkpoint)
  elif mode == 'train_and_eval':
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    tf.logging.info('Starting training at step=%d.', current_step)
    # Cast to int so a fractional `num_epochs_per_eval` cannot turn the step
    # count passed to `train()` into a float.
    train_steps_per_eval = int(
        hparams.num_epochs_per_eval * train_steps_per_epoch)
    # Left uncast on purpose: the comparisons below then behave exactly as
    # they would with a fractional epoch count.
    max_train_steps = hparams.num_epochs * train_steps_per_epoch
    # Final Evaluation if training is finished.
    if current_step >= max_train_steps:
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
    # Alternate between a block of training steps and one evaluation pass.
    while current_step < max_train_steps:
      image_classifier.train(
          input_fn=imagenet_train.input_fn, steps=train_steps_per_eval)
      current_step += train_steps_per_eval
      tf.logging.info('Starting evaluation at step=%d.', current_step)
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
  else:
    # Every mode other than 'eval' / 'train_and_eval' is treated as training.
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    total_step = int(hparams.num_epochs * train_steps_per_epoch)
    # Only train for the steps that remain after the latest checkpoint.
    if current_step < total_step:
      tf.logging.info('Starting training ...')
      image_classifier.train(
          input_fn=imagenet_train.input_fn,
          steps=total_step - current_step)
def main(_):
  """Runs the job selected by `FLAGS.mode` on a plain `tf.estimator.Estimator`.

  Recognized modes are 'eval', 'train_and_eval', 'predict' and 'train'; any
  other value only logs 'Mode not found.'.

  Args:
    _: Unused positional argument.
  """
  mode = FLAGS.mode
  data_dir = FLAGS.data_dir
  model_dir = FLAGS.model_dir
  hparams = build_hparams()

  estimator_parmas = {}

  # Round up so that a partial final batch still counts as a training step.
  train_steps_per_epoch = int(
      math.ceil(hparams.num_train_images / float(hparams.train_batch_size)))
  eval_steps = hparams.num_eval_images // hparams.eval_batch_size
  # No eval/predict batch size is needed in pure training.
  eval_batch_size = None if mode == 'train' else hparams.eval_batch_size

  model = slice_model_lib.AmoebaNetEstimatorModel(hparams, model_dir)
  # Fall back to the loop size when no explicit checkpoint interval is set.
  save_checkpoints_steps = (
      FLAGS.save_checkpoints_steps or FLAGS.iterations_per_loop)

  prepare_tf_config()
  # rewrite_options = rewriter_config_pb2.RewriterConfig(
  #     layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF)
  # graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_options)
  session_config = tf.ConfigProto(
      # graph_options=graph_options,
      allow_soft_placement=True,
      log_device_placement=False,
      gpu_options=tf.GPUOptions(allow_growth=True))
  if FLAGS.cross_pipeline:
    # NOTE(review): the manager is never read again in this function; the
    # binding is kept in case the object must stay alive — confirm.
    cluster_manager = cluster_utils.get_cluster_manager(
        config_proto=session_config)
  run_config = tf.estimator.RunConfig(
      log_step_count_steps=100,
      session_config=session_config,
      save_checkpoints_steps=save_checkpoints_steps)
  image_classifier = tf.estimator.Estimator(
      model_fn=model.model_fn,
      config=run_config,
      params=estimator_parmas)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = model_lib.InputPipeline(
      is_training=True, data_dir=data_dir, hparams=hparams)
  imagenet_eval = model_lib.InputPipeline(
      is_training=False, data_dir=data_dir, hparams=hparams)

  # The EMA hook is only installed when the decay is below 1.
  if hparams.moving_average_decay < 1:
    eval_hooks = [
        model_lib.LoadEMAHook(model_dir, hparams.moving_average_decay)
    ]
  else:
    eval_hooks = []

  if mode == 'eval':
    for checkpoint in _get_next_checkpoint():
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = image_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        tf.logging.info('Evaluation results: %s', eval_results)
      except tf.errors.NotFoundError:
        # Skip the checkpoint if it gets deleted prior to evaluation.
        tf.logging.info(
            'Checkpoint %s no longer exists ... skipping', checkpoint)
  elif mode == 'train_and_eval':
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    tf.logging.info('Starting training at step=%d.', current_step)
    train_steps_per_eval = int(
        hparams.num_epochs_per_eval * train_steps_per_epoch)
    # Left uncast on purpose: the comparisons below then behave exactly as
    # they would with a fractional epoch count.
    max_train_steps = hparams.num_epochs * train_steps_per_epoch
    # Final Evaluation if training is finished.
    if current_step >= max_train_steps:
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
    # Alternate between a block of training steps and one evaluation pass.
    while current_step < max_train_steps:
      image_classifier.train(
          input_fn=imagenet_train.input_fn, steps=train_steps_per_eval)
      current_step += train_steps_per_eval
      tf.logging.info('Starting evaluation at step=%d.', current_step)
      eval_results = image_classifier.evaluate(
          input_fn=imagenet_eval.input_fn,
          steps=eval_steps,
          hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s', eval_results)
  elif mode == 'predict':
    for checkpoint in _get_next_checkpoint():
      tf.logging.info('Starting prediction ...')
      time_hook = model_lib.SessionTimingHook()
      # Build a per-checkpoint hook list instead of appending to `eval_hooks`,
      # so timing hooks from earlier checkpoints do not pile up.
      result_iter = image_classifier.predict(
          input_fn=imagenet_eval.input_fn,
          hooks=eval_hooks + [time_hook],
          checkpoint_path=checkpoint,
          yield_single_examples=False)
      # Each yielded item is a whole batch (yield_single_examples=False), so
      # the image count is batches * batch size.
      results = list(itertools.islice(result_iter, eval_steps))
      tf.logging.info('Inference speed = {} images per second.'.format(
          time_hook.compute_speed(len(results) * eval_batch_size)))
  elif mode == 'train':
    current_step = _load_global_step_from_checkpoint_dir(model_dir)
    total_step = int(hparams.num_epochs * train_steps_per_epoch)
    if current_step < total_step:
      tf.logging.info('Starting training ...')
      # Train for the remaining steps, capped by FLAGS.max_steps.
      # NOTE(review): assumes FLAGS.max_steps is a number; `min` would raise
      # on None under Python 3 — confirm the flag's default.
      image_classifier.train(
          input_fn=imagenet_train.input_fn,
          steps=min(total_step - current_step, FLAGS.max_steps))
  else:
    tf.logging.info('Mode not found.')