def main(argv):
  del argv

  # Hyperparameters derived from the paper.
  hparams = mobilenet_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_eval_examples=FLAGS.num_eval_examples,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      num_shards=FLAGS.num_shards,
      num_batches_per_epoch=FLAGS.num_examples_per_epoch / FLAGS.batch_size,
  )

  # Create the model directory before writing hparams.json into it.
  tf.gfile.MakeDirs(FLAGS.model_dir)
  with tf.gfile.GFile(FLAGS.model_dir + "/hparams.json", "w") as f:
    f.write(hparams.to_json())

  num_training_examples = FLAGS.num_examples_per_epoch * params["num_epochs"]
  num_eval_batches = FLAGS.num_eval_examples // FLAGS.batch_size
  num_training_batches = num_training_examples // FLAGS.batch_size

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after each epoch of the training set is processed.
  for _ in range(FLAGS.num_epochs):
    tf.logging.info("Training one epoch: %s steps",
                    num_training_batches // FLAGS.num_epochs)
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=num_training_batches // FLAGS.num_epochs)

    tf.logging.info("Running evaluation")
    tf.logging.info("%s", estimator.evaluate(
        input_fn=data_pipeline.InputReader(
            FLAGS.data_dir, is_training=False),
        steps=num_eval_batches,
    ))

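# NOTE: The drivers in this section assume a TF 1.x preamble roughly like the
# sketch below. The exact module paths are an assumption based on the pre-2.0
# tf.contrib layout; data_pipeline, the model definitions, and the FLAGS
# definitions come from the surrounding project and are not reproduced here.
#
#   import os
#   import time
#
#   import tensorflow as tf
#
#   from tensorflow.contrib.tpu.python.tpu import tpu_config
#   from tensorflow.contrib.tpu.python.tpu import tpu_estimator
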
def main(argv):
  del argv

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # Approximate ImageNet sizes: ~1.3M training images and 50k eval images.
  training_examples = 1300 * 1000 * FLAGS.num_epochs
  eval_examples = 50 * 1000

  params = {
      "num_classes": 1001,
      "lr": FLAGS.learning_rate,
      "min_lr": 0.005,
      "momentum": FLAGS.momentum,
      "optimizer": FLAGS.optimizer,
      "num_eval_examples": eval_examples,
      "num_shards": FLAGS.num_shards,
      "num_epochs": FLAGS.num_epochs,
  }

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tf.contrib.tpu.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  num_evals = max(FLAGS.num_evals, 1)
  examples_per_eval = training_examples // num_evals
  for _ in range(num_evals):
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=examples_per_eval // FLAGS.batch_size)

    tf.logging.info("Running evaluation")
    tf.logging.info(
        "%s", estimator.evaluate(
            input_fn=data_pipeline.InputReader(
                FLAGS.data_dir, is_training=False),
            steps=eval_examples // FLAGS.batch_size,
        ))

def main(argv):
  del argv

  training_examples = 1300 * 1000 * FLAGS.num_epochs
  eval_examples = 50 * 1000

  params = {
      "num_classes": 1001,
      "lr": 0.04,
      "min_lr": 0.0004,
      "momentum": FLAGS.momentum,
      "optimizer": FLAGS.optimizer,
      "num_eval_examples": eval_examples,
      "num_shards": FLAGS.num_shards,
      "num_epochs": FLAGS.num_epochs,
  }

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after every 5% of the training examples have been
  # processed (20 evaluations in total).
  num_evals = 20
  for _ in range(num_evals):
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=training_examples // (num_evals * FLAGS.batch_size))

    tf.logging.info("Running evaluation")
    tf.logging.info("%s", estimator.evaluate(
        input_fn=data_pipeline.InputReader(
            FLAGS.data_dir, is_training=False),
        steps=eval_examples // FLAGS.batch_size,
    ))

def train_and_eval(deeplab_estimator, train_dataset, eval_dataset,
                   num_batches_per_epoch):
  """Interleaves training and evaluation."""
  # pylint: disable=protected-access
  current_step = estimator._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)

  tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                  ' step %d.' % (FLAGS.train_steps,
                                 FLAGS.train_steps / num_batches_per_epoch,
                                 current_step))

  start_timestamp = time.time()

  while current_step < FLAGS.train_steps:
    # Train for up to steps_per_eval number of steps. At the end of training,
    # a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                          FLAGS.train_steps)

    train_input_fn = data_pipeline.InputReader(
        train_dataset,
        FLAGS.train_split,
        is_training=True,
        model_variant=FLAGS.model_variant)

    deeplab_estimator.train(
        input_fn=train_input_fn, max_steps=next_checkpoint)

    current_step = next_checkpoint

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                    (current_step, elapsed_time))

    tf.logging.info('Starting to evaluate.')

    eval_input_fn = data_pipeline.InputReader(
        eval_dataset,
        FLAGS.eval_split,
        is_training=False,
        model_variant=FLAGS.model_variant)

    eval_results = deeplab_estimator.evaluate(
        input_fn=eval_input_fn,
        steps=eval_dataset.num_samples // FLAGS.eval_batch_size)

    tf.logging.info('Eval results: %s' % eval_results)

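# train_and_eval() above reaches into a private Estimator helper to recover
# the last saved global step. A minimal public-API equivalent is sketched
# below; it is an assumption of identical behavior, not the library's own
# implementation.
def load_global_step_from_checkpoint_dir(checkpoint_dir):
  """Returns the global step of the latest checkpoint, or 0 if none exists."""
  try:
    checkpoint_reader = tf.train.NewCheckpointReader(
        tf.train.latest_checkpoint(checkpoint_dir))
    return checkpoint_reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
  except:  # pylint: disable=bare-except
    return 0
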
def main(argv):
  del argv

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  training_examples = 1300 * 1000 * FLAGS.num_epochs
  eval_examples = 50 * 1000

  params = {
      "num_classes": 1001,
      "lr": FLAGS.learning_rate,
      "min_lr": 0.005,
      "momentum": FLAGS.momentum,
      "optimizer": FLAGS.optimizer,
      "num_eval_examples": eval_examples,
      "num_shards": FLAGS.num_shards,
      "num_epochs": FLAGS.num_epochs,
  }

  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # The periodic-evaluation loop is disabled in this variant; train straight
  # through for --train_steps steps instead.
  # num_evals = max(FLAGS.num_evals, 1)
  # examples_per_eval = training_examples // num_evals
  # for _ in range(num_evals):
  estimator.train(
      input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
      # steps=examples_per_eval // FLAGS.batch_size)
      steps=FLAGS.train_steps)

def main(unused_argv):
  params = params_dict.ParamsDict(
      squeezenet_config.SQUEEZENET_CFG,
      squeezenet_config.SQUEEZENET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  total_steps = (
      (params.train.num_epochs * params.train.num_examples_per_epoch) //
      params.train.train_batch_size)
  params.override(
      {
          "train": {
              "total_steps": total_steps
          },
          "eval": {
              "num_steps_per_eval": total_steps // params.eval.num_evals
          },
      },
      is_strict=False)
  params.validate()
  params.lock()

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  # When async checkpointing is enabled, the estimator must not also save
  # checkpoints itself, so leave save_checkpoints_steps unset in that case.
  save_checkpoints_steps = None
  if not params.use_async_checkpointing:
    save_checkpoints_steps = max(5000, params.train.iterations_per_loop)

  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=params.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=params.train.iterations_per_loop,
          num_shards=params.train.num_cores_per_replica,
      ),
  )

  estimator = contrib_tpu.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=params.use_tpu,
      config=run_config,
      train_batch_size=params.train.train_batch_size,
      eval_batch_size=params.eval.eval_batch_size,
      params=params.as_dict(),
  )

  for eval_cycle in range(params.eval.num_evals):
    current_cycle_last_train_step = (
        (eval_cycle + 1) * params.eval.num_steps_per_eval)
    # Train up to the cumulative step count for this cycle. max_steps is an
    # absolute bound, matching the variable name; steps= would instead add
    # that many steps on every cycle and overshoot total_steps.
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        max_steps=current_cycle_last_train_step)

    tf.logging.info("Running evaluation")
    tf.logging.info(
        "%s",
        estimator.evaluate(
            input_fn=data_pipeline.InputReader(
                FLAGS.data_dir, is_training=False),
            steps=params.eval.num_eval_examples //
            params.eval.eval_batch_size))

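# A minimal sketch of the two-stage override flow used above, with
# hypothetical values. The YAML-string form accepted by override_params_dict
# is an assumption based on the ParamsDict API used in this repo:
#
#   params = params_dict.ParamsDict(squeezenet_config.SQUEEZENET_CFG,
#                                   squeezenet_config.SQUEEZENET_RESTRICTIONS)
#   params = params_dict.override_params_dict(
#       params, "train: {train_batch_size: 1024}", is_strict=True)
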
def main(unused_argv):
  train_dataset = segmentation_dataset.get_dataset(
      FLAGS.dataset_name, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)
  eval_dataset = segmentation_dataset.get_dataset(
      FLAGS.dataset_name, FLAGS.eval_split, dataset_dir=FLAGS.dataset_dir)

  num_train_images = train_dataset.num_samples
  num_classes = train_dataset.num_classes
  ignore_label = train_dataset.ignore_label

  num_batches_per_epoch = num_train_images / FLAGS.train_batch_size

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))

  params = get_params(ignore_label, num_classes, num_batches_per_epoch)

  deeplab_estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model.model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      params=params)

  if FLAGS.mode == 'train':
    tf.logging.info('Training for %d steps (%.2f epochs in total).' %
                    (FLAGS.train_steps,
                     FLAGS.train_steps / num_batches_per_epoch))

    train_input_fn = data_pipeline.InputReader(
        train_dataset,
        FLAGS.train_split,
        is_training=True,
        model_variant=FLAGS.model_variant)

    deeplab_estimator.train(
        input_fn=train_input_fn, max_steps=FLAGS.train_steps)
  elif FLAGS.mode == 'train_and_eval':
    train_and_eval(deeplab_estimator, train_dataset, eval_dataset,
                   num_batches_per_epoch)
  elif FLAGS.mode == 'eval':
    eval_input_fn = data_pipeline.InputReader(
        eval_dataset,
        FLAGS.eval_split,
        is_training=False,
        model_variant=FLAGS.model_variant)

    # Run evaluation whenever there's a new checkpoint.
    for ckpt in tf.contrib.training.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = deeplab_estimator.evaluate(
            input_fn=eval_input_fn,
            steps=eval_dataset.num_samples // FLAGS.eval_batch_size)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate the eval job once the final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  else:
    tf.logging.error('Mode %s not found.' % FLAGS.mode)

def main(argv):
  del argv

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  # Hyperparameters derived from the paper.
  hparams = mobilenet_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_eval_examples=FLAGS.num_eval_examples,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      num_shards=FLAGS.num_shards,
      num_batches_per_epoch=FLAGS.num_examples_per_epoch / FLAGS.batch_size,
  )

  # Create the model directory before writing hparams.json into it.
  tf.gfile.MakeDirs(FLAGS.model_dir)
  with tf.gfile.GFile(FLAGS.model_dir + "/hparams.json", "w") as f:
    f.write(hparams.to_json())

  num_training_examples = FLAGS.num_examples_per_epoch * params["num_epochs"]
  num_eval_batches = FLAGS.num_eval_examples // FLAGS.batch_size
  num_training_batches = num_training_examples // FLAGS.batch_size

  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after each epoch of the training set is processed.
  for _ in range(FLAGS.num_epochs):
    tf.logging.info("Training one epoch: %s steps",
                    num_training_batches // FLAGS.num_epochs)
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=num_training_batches // FLAGS.num_epochs)

    tf.logging.info("Running evaluation")
    tf.logging.info("%s", estimator.evaluate(
        input_fn=data_pipeline.InputReader(
            FLAGS.data_dir, is_training=False),
        steps=num_eval_batches,
    ))

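# Every driver above passes a data_pipeline.InputReader instance as input_fn.
# The stub below is a hypothetical sketch of the callable contract
# TPUEstimator expects, not the project's real ImageNet pipeline; the file
# patterns and the omitted record parsing are placeholders.
class InputReaderStub(object):
  """Callable input_fn that returns a batched tf.data.Dataset."""

  def __init__(self, data_dir, is_training):
    self._data_dir = data_dir
    self._is_training = is_training

  def __call__(self, params):
    # TPUEstimator injects the per-shard batch size into params.
    batch_size = params["batch_size"]
    file_pattern = self._data_dir + (
        "/train-*" if self._is_training else "/validation-*")
    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    if self._is_training:
      dataset = dataset.shuffle(1024).repeat()
    # A real pipeline would parse and preprocess the records here. TPUs
    # require fixed shapes, hence drop_remainder=True.
    return dataset.batch(batch_size, drop_remainder=True)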