def get_estimator(model_dir, resolution):
  tpu_cluster_resolver = None
  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=model_dir,
        tpu_config=tpu_config.TPUConfig(
            num_shards=FLAGS.num_shards,
            iterations_per_loop=FLAGS.iterations_per_loop))
    est = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        use_tpu=FLAGS.use_tpu,
        config=config,
        params={"data_dir": FLAGS.data_dir, "resolution": resolution},
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size)
    local_est = tpu_estimator.TPUEstimator(
        model_fn=model_fn,
        use_tpu=False,
        config=config,
        params={"data_dir": FLAGS.data_dir, "resolution": resolution},
        predict_batch_size=FLAGS.num_eval_images)
  else:
    est = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_dir,
        params={
            "data_dir": FLAGS.data_dir,
            "batch_size": FLAGS.batch_size,
            "resolution": resolution
        })
    local_est = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_dir,
        params={
            "data_dir": FLAGS.data_dir,
            "batch_size": FLAGS.num_eval_images,
            "resolution": resolution
        })
  return est, local_est
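# Every snippet in this section passes a model_fn that is defined elsewhere in
# its source file. As a reference point, here is a minimal sketch of the
# contract TPUEstimator expects: a hypothetical linear classifier, not any of
# the models above. On TPU the optimizer must be wrapped in
# CrossShardOptimizer and the function returns a TPUEstimatorSpec.
def sketch_model_fn(features, labels, mode, params):
  """Hypothetical minimal model_fn illustrating the TPUEstimator contract."""
  logits = tf.layers.dense(features, 10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  if params["use_tpu"]:
    # Aggregate gradients across TPU shards; required for TPU training.
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, loss=loss,
                                         train_op=train_op)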
def main(_):
  # Resolve the TPU endpoint and build the run configuration.
  tpu_grpc_url = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu=["demo-tpu"]).get_master()
  model_dir = os.path.join(FLAGS.out_dir, str(int(time.time()))) + "/"
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=model_dir,
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_replica))  # If num_shards is omitted, 8 is used.
  cifar10_resnet_classifier = tpu_estimator.TPUEstimator(
      model_fn=_my_model_fn,
      use_tpu=True,
      config=run_config,
      train_batch_size=batch_size)
  # Run training.
  cifar10_resnet_classifier.train(
      input_fn=_my_input_fn,
      # max_steps=50000 * 10 // batch_size)  # Full spec
      max_steps=5000)  # For benchmarking
def create_estimator(master,
                     model_dir,
                     use_tpu,
                     iterations_per_loop,
                     num_shards,
                     model_params,
                     include_features_in_predictions=True,
                     decode_keys=(),
                     train_init_checkpoint=None,
                     train_warmup_steps=10000,
                     save_checkpoints_steps=1000,
                     keep_checkpoint_max=5):
  """Returns a TensorFlow estimator."""
  run_config = tpu_config.RunConfig(
      master=master,
      model_dir=model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(iterations_per_loop),
      save_checkpoints_steps=save_checkpoints_steps,
      keep_checkpoint_max=keep_checkpoint_max)
  return tpu_estimator.TPUEstimator(
      model_fn=_estimator_model_fn(use_tpu, model_params, model_dir,
                                   include_features_in_predictions,
                                   decode_keys, train_init_checkpoint,
                                   train_warmup_steps),
      use_tpu=use_tpu,
      train_batch_size=model_params.batch_size * num_shards,
      eval_batch_size=model_params.batch_size * num_shards,
      predict_batch_size=model_params.batch_size * num_shards,
      config=run_config)
def construct_estimator(model_fn, hparams, tpu=None):
  if hparams.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu=tpu.name)
    master = tpu_cluster_resolver.get_master()
    config = tpu_config.RunConfig(
        master=master,
        evaluation_master=master,
        model_dir=hparams.output_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.tpu_iterations_per_loop,
            num_shards=FLAGS.tpu_shards),
        save_checkpoints_steps=FLAGS.eval_every)
    estimator = tpu_estimator.TPUEstimator(
        use_tpu=hparams.use_tpu,
        model_fn=model_fn,
        model_dir=hparams.output_dir,
        config=config,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.batch_size)
  else:
    gpu_config = tf.ConfigProto(allow_soft_placement=True)
    gpu_config.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=FLAGS.eval_every,
        session_config=gpu_config)
    estimator = tf.estimator.Estimator(
        model_fn=tf.contrib.estimator.replicate_model_fn(model_fn),
        model_dir=hparams.output_dir,
        config=run_config)
  return estimator
def run_training(hparams):
  """For benchmarking convenience, run only the training job."""
  model_module = {
      MATRIX_FACTORIZATION: matrix_factorization_model,
      DNN_SOFTMAX: dnn_softmax_model
  }[hparams.model_type]
  features_padding_fn, model_fn, target_features_fn = (
      model_module.get_pad_and_model_fns(hparams))
  estimator = tpu_estimator.TPUEstimator(
      model_dir=hparams.output_path,
      model_fn=model_fn,
      train_batch_size=hparams.batch_size,
      use_tpu=hparams.use_tpu,
      config=tpu_config.RunConfig(
          master=hparams.master,
          tpu_config=tpu_config.TPUConfig(
              hparams.tpu_loop_steps, num_shards=hparams.tpu_cores)))
  train_data_paths = os.path.join(hparams.train_data_dir, 'features_train-*')
  train_input_fn = make_input_fn(
      hparams=hparams,
      mode=tf.contrib.learn.ModeKeys.TRAIN,
      data_file_pattern=train_data_paths,
      features_padding_fn=features_padding_fn,
      target_features_fn=target_features_fn,
      randomize_input=hparams.randomize_input,
      queue_capacity=4 * hparams.batch_size)
  estimator.train(input_fn=train_input_fn, steps=hparams.train_steps)
def main(unused_argv):
  del unused_argv
  start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      # tpu_config=tpu_config.TPUConfig(5, FLAGS.num_shards,
      #                                 per_host_input_for_training=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=128,
      eval_batch_size=128,
      config=run_config)
  estimator.train(input_fn=get_input_fn(FLAGS.train_file),
                  max_steps=FLAGS.train_steps)
  estimator.evaluate(input_fn=get_input_fn(FLAGS.eval_file), steps=100)
  total = time.time() - start
  print("Total time: " + str(total))
def main(argv):
  del argv  # Unused.
  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")
  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
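# The input_fn handed to TPUEstimator.train above is defined elsewhere.
# TPUEstimator calls it with a params dict carrying the per-shard batch size
# derived from train_batch_size, and TPU execution requires statically shaped
# batches. A minimal sketch with dummy random data (all names here are
# hypothetical, not the input pipelines these examples actually use):
def sketch_input_fn(params):
  """Hypothetical minimal input_fn; TPUEstimator supplies params['batch_size']."""
  batch_size = params["batch_size"]
  features = tf.random_uniform([1000, 10])
  labels = tf.random_uniform([1000], maxval=10, dtype=tf.int32)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  # drop_remainder=True keeps batch shapes static, as TPUs require.
  return dataset.repeat().shuffle(1000).batch(batch_size, drop_remainder=True)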
def main(unused_argv):
  config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))
  resnet_classifier = tpu_estimator.TPUEstimator(
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)
  if FLAGS.enable_eval:
    for cycle in range(FLAGS.train_steps // FLAGS.steps_per_eval):
      tf.logging.info('Starting a training cycle.')
      resnet_classifier.train(
          input_fn=ImageNetInput(True), steps=FLAGS.steps_per_eval)
      _EVAL_STEPS = 50000 // FLAGS.eval_batch_size
      tf.logging.info('Starting to evaluate.')
      eval_results = resnet_classifier.evaluate(
          input_fn=ImageNetInput(False), steps=_EVAL_STEPS)
      tf.logging.info('Eval results: %s' % eval_results)
  else:
    tf.logging.info('Starting training.')
    resnet_classifier.train(
        input_fn=ImageNetInput(True), steps=FLAGS.train_steps)
def run_toy_model_tpu():
  """Run a toy model on TPU."""
  iterations_per_loop = FLAGS.iterations
  mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
  config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=None,  # Disable the default saver
      save_checkpoints_secs=None,  # Disable the default saver
      log_step_count_steps=iterations_per_loop,
      tpu_config=tpu_config.TPUConfig(
          num_shards=mesh_shape.size,
          iterations_per_loop=iterations_per_loop,
          num_cores_per_replica=1,
          per_host_input_for_training=tpu_config.InputPipelineConfig.BROADCAST))
  classifier = tpu_estimator.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size)
  current_step = estimator_lib._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)  # pylint: disable=protected-access
  logging.info('Current step %d', current_step)
  while current_step < FLAGS.train_steps:
    next_checkpoint = min(current_step + FLAGS.steps_per_checkpoint,
                          FLAGS.train_steps)
    classifier.train(input_fn=ToyModelInput(), max_steps=next_checkpoint)
    current_step = next_checkpoint
  tf.logging.info('Starting to evaluate.')
  eval_results = classifier.evaluate(
      input_fn=ToyModelInput(),
      steps=156)  # Since we have 10000 examples and batch_size = 64 per host.
  logging.info('Eval results: %s', eval_results)
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)
  if not FLAGS.train_file:
    tf.logging.fatal("Flag --train_file must be set for training. Aborting.")
  if FLAGS.eval_steps and not FLAGS.eval_file:
    tf.logging.fatal("Flag --eval_file must be set for evaluation. Aborting.")
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      config=run_config)
  estimator.train(input_fn=get_input_fn(FLAGS.train_file),
                  max_steps=FLAGS.train_steps)
  if FLAGS.eval_steps:
    estimator.evaluate(input_fn=get_input_fn(FLAGS.eval_file),
                       steps=FLAGS.eval_steps)
def _get_tpu_estimator():
  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=None, project=None)
  tpu_grpc_url = tpu_cluster_resolver.get_master()
  run_config = contrib_tpu_python_tpu_tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.work_dir,
      save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop),
      save_summary_steps=FLAGS.summary_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=contrib_tpu_python_tpu_tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=(
              contrib_tpu_python_tpu_tpu_config.InputPipelineConfig
              .PER_HOST_V2)))
  return contrib_tpu_python_tpu_tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
      eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
      params=FLAGS.flag_values_dict())
def main(argv):
  del argv  # Unused.
  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    tpu_grpc_url = None
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def main(unused_argv):
  if FLAGS.use_tpu:
    # Determine the gRPC URL of the TPU device to use.
    if FLAGS.master is None and FLAGS.tpu_name is None:
      raise RuntimeError('You must specify either --master or --tpu_name.')
    if FLAGS.master is not None:
      if FLAGS.tpu_name is not None:
        tf.logging.warn('Both --master and --tpu_name are set. Ignoring'
                        ' --tpu_name and using --master.')
      tpu_grpc_url = FLAGS.master
    else:
      tpu_cluster_resolver = (
          tf.contrib.cluster_resolver.TPUClusterResolver(
              FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
      tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    # URL is unused if running locally without TPU.
    tpu_grpc_url = None
  config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores))
  resnet_classifier = tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)
  # Input pipelines are slightly different (with regard to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False, data_dir=FLAGS.data_dir)
  current_step = estimator._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)  # pylint: disable=protected-access
  batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
  tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                  ' step %d.' % (FLAGS.train_steps,
                                 FLAGS.train_steps / batches_per_epoch,
                                 current_step))
  # start_timestamp = time.time()
  # while current_step < FLAGS.train_steps:
  #   # Train for up to steps_per_eval number of steps. At the end of
  #   # training, a checkpoint will be written to --model_dir.
  #   next_checkpoint = min(current_step + FLAGS.steps_per_eval,
  #                         FLAGS.train_steps)
  resnet_classifier.train(
      input_fn=imagenet_train.input_fn, max_steps=FLAGS.train_steps)
def main(argv):
  del argv
  training_examples = FLAGS.train_epochs * 40000
  eval_examples = 10000
  iterations_per_loop = (training_examples // 10) // FLAGS.train_batch_size
  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")
  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.steps_per_checkpoint,
      log_step_count_steps=iterations_per_loop,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_shards,
      ),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      params=dict(CIFAR_SMALL_PARAMS, use_tpu=FLAGS.use_tpu),
  )
  # Evaluate the test set after each 10% of the training examples is finished.
  for cycle in range(10):
    tf.logging.info("Starting %d train steps" %
                    (training_examples // 10 // FLAGS.train_batch_size))
    estimator.train(
        input_fn=InputReader(FLAGS.train_file, is_training=True),
        steps=training_examples // 10 // FLAGS.train_batch_size)
    tf.logging.info("Starting evaluation cycle %d." % cycle)
    print(estimator.evaluate(
        input_fn=InputReader(FLAGS.train_file, is_training=False),
        steps=eval_examples // FLAGS.eval_batch_size,
    ))
def main(argv):
  del argv
  # Hyperparameters derived from the paper.
  hparams = mobilenet_hparams()
  hparams.parse(FLAGS.hparams)
  params = dict(
      hparams.values(),
      num_eval_examples=FLAGS.num_eval_examples,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      num_shards=FLAGS.num_shards,
      num_batches_per_epoch=FLAGS.num_examples_per_epoch / FLAGS.batch_size,
  )
  # Create the model directory before writing into it.
  tf.gfile.MakeDirs(FLAGS.model_dir)
  with tf.gfile.GFile(FLAGS.model_dir + "/hparams.json", "w") as f:
    f.write(hparams.to_json())
  num_training_examples = FLAGS.num_examples_per_epoch * params["num_epochs"]
  num_eval_batches = FLAGS.num_eval_examples // FLAGS.batch_size
  num_training_batches = num_training_examples // FLAGS.batch_size
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )
  # Evaluate the test set after each epoch of the training set is processed.
  for _ in range(FLAGS.num_epochs):
    tf.logging.info("Training one epoch: %s steps",
                    num_training_batches // FLAGS.num_epochs)
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=num_training_batches // FLAGS.num_epochs)
    tf.logging.info("Running evaluation")
    tf.logging.info(
        "%s",
        estimator.evaluate(
            input_fn=data_pipeline.InputReader(FLAGS.data_dir,
                                               is_training=False),
            steps=num_eval_batches,
        ))
def _make_estimator(use_tpu, model_dir):
  return tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=use_tpu,
      config=_make_run_config(model_dir),
      train_batch_size=num_shards,
      params=dict(params, use_tpu=use_tpu),
  )
def main(argv):
  del argv
  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")
  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  training_examples = 1300 * 1000 * FLAGS.num_epochs
  eval_examples = 50 * 1000
  params = {
      "num_classes": 1001,
      "lr": FLAGS.learning_rate,
      "min_lr": 0.005,
      "momentum": FLAGS.momentum,
      "optimizer": FLAGS.optimizer,
      "num_eval_examples": eval_examples,
      "num_shards": FLAGS.num_shards,
      "num_epochs": FLAGS.num_epochs,
  }
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          num_shards=FLAGS.num_shards,
      ),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )
  # num_evals = max(FLAGS.num_evals, 1)
  # examples_per_eval = training_examples // num_evals
  # for _ in range(num_evals):
  estimator.train(
      input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
      # steps=examples_per_eval // FLAGS.batch_size)
      steps=FLAGS.train_steps)
def main(unused_argv):
  del unused_argv
  start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)
  print('Tensorflow version: ' + str(tf.__version__))
  for k, v in iter(tf.app.flags.FLAGS.flag_values_dict().items()):
    print("***%s: %s" % (k, v))
  if FLAGS.use_tpu:
    if FLAGS.tpu_name is None:
      raise RuntimeError("You must specify --tpu_name.")
    else:
      if '1.6.0' in tf.__version__:
        # TF 1.6 took a tpu_names list instead of a single name.
        tpu_cluster_resolver = (
            tf.contrib.cluster_resolver.TPUClusterResolver(
                tpu_names=[FLAGS.tpu_name],
                zone=FLAGS.tpu_zone,
                project=FLAGS.gcp_project))
      else:
        tpu_cluster_resolver = (
            tf.contrib.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name,
                zone=FLAGS.tpu_zone,
                project=FLAGS.gcp_project))
      tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    tpu_grpc_url = ''
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=None,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=False,
          gpu_options=tf.GPUOptions(allow_growth=True)),
      tpu_config=tpu_config.TPUConfig(iterations_per_loop=FLAGS.iterations,
                                      num_shards=FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      params={"output_size": output_size, "input_size": input_size},
      use_tpu=FLAGS.use_tpu,
      train_batch_size=batch_size,
      config=run_config)
  estimator.train(input_fn=get_input_fn(input_size, output_size),
                  max_steps=FLAGS.train_steps)
  total_time = time.time() - start
  example_per_sec = batch_size * FLAGS.train_steps / total_time
  global_step_per_sec = FLAGS.train_steps / total_time
  print("Total time: " + str(total_time))
  print("Examples/sec: " + str(example_per_sec))
  print("Global steps/sec: " + str(global_step_per_sec))
def main(unused_argv):
  start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)
  if FLAGS.use_tpu:
    tf.logging.info("Using TPUs.")
  else:
    tf.logging.info("NOT using TPUs.")
  if FLAGS.use_tpu:
    tf.logging.info('tpu name: %s', FLAGS.tpu_name)
    if FLAGS.tpu_name is None:
      raise RuntimeError("You must specify --tpu_name.")
    else:
      if '1.6.0' in tf.__version__:
        # TF 1.6 took a tpu_names list instead of a single name.
        tpu_cluster_resolver = (
            tf.contrib.cluster_resolver.TPUClusterResolver(
                tpu_names=[os.uname()[1]],
                zone=FLAGS.tpu_zone,
                project=FLAGS.gcp_project))
      else:
        tpu_cluster_resolver = (
            tf.contrib.cluster_resolver.TPUClusterResolver(
                os.uname()[1],
                zone=FLAGS.tpu_zone,
                project=FLAGS.gcp_project))
      tpu_grpc_url = tpu_cluster_resolver.get_master()
  else:
    tpu_grpc_url = ''
  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=None,
      tpu_config=tpu_config.TPUConfig(iterations_per_loop=FLAGS.iterations,
                                      num_shards=FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      params={
          "bs": FLAGS.batch_size,
          "output_dim": output_dim,
          "input_dim": input_dim
      },
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      config=run_config)
  estimator.train(
      input_fn=get_input_fn(FLAGS.batch_size, input_dim, output_dim),
      max_steps=FLAGS.train_steps)
  total = time.time() - start
  tf.logging.info("Total time: " + str(total))
def main(argv): del argv training_examples = 1300 * 1000 * FLAGS.num_epochs eval_examples = 50 * 1000 params = { "num_classes": 1001, "lr": 0.04, "min_lr": 0.0004, "momentum": FLAGS.momentum, "optimizer": FLAGS.optimizer, "num_eval_examples": eval_examples, "num_shards": FLAGS.num_shards, "num_epochs": FLAGS.num_epochs, } run_config = tpu_config.RunConfig( master=FLAGS.master, model_dir=FLAGS.model_dir, save_checkpoints_secs=FLAGS.save_checkpoints_secs, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config.TPUConfig( iterations_per_loop=100, num_shards=FLAGS.num_shards, ), ) estimator = tpu_estimator.TPUEstimator( model_fn=squeezenet_model.model_fn, use_tpu=FLAGS.use_tpu, config=run_config, train_batch_size=FLAGS.batch_size, eval_batch_size=FLAGS.batch_size, params=dict(params, use_tpu=FLAGS.use_tpu), ) # Evaluate the test set after 5% of training examples are finished. num_evals = 20 for _ in range(num_evals): estimator.train( input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True), steps=training_examples // (num_evals * FLAGS.batch_size)) tf.logging.info("Running evaluation") tf.logging.info("%s", estimator.evaluate( input_fn=data_pipeline.InputReader( FLAGS.data_dir, is_training=False), steps=eval_examples // FLAGS.batch_size, ))
def train(*tf_records, steps=None):
  tf.logging.set_verbosity(tf.logging.INFO)
  if FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
    config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        evaluation_master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=max(800, FLAGS.iterations_per_loop),
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=(
                tpu_config.InputPipelineConfig.PER_HOST_V2)))
    estimator = tpu_estimator.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
        eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores)

    def input_fn(params):
      return preprocessing.get_tpu_input_tensors(params['batch_size'],
                                                 tf_records)

    # TODO: get hooks working again with TPUEstimator.
    hooks = []
  else:
    estimator = get_estimator(FLAGS.model_dir)

    def input_fn():
      return preprocessing.get_input_tensors(
          FLAGS.train_batch_size,
          tf_records,
          filter_amount=1.0,
          shuffle_buffer_size=FLAGS.shuffle_buffer_size)

    hooks = [
        UpdateRatioSessionHook(FLAGS.model_dir),
        EchoStepCounterHook(output_dir=FLAGS.model_dir)
    ]
  if steps is None:
    steps = EXAMPLES_PER_GENERATION // FLAGS.train_batch_size
  print("Training, steps = {}".format(steps))
  estimator.train(input_fn, steps=int(steps), hooks=hooks)
def test_fail_with_tpu_estimator(self):

  def dummy_model_fn(features, labels):
    del features, labels  # unused

  with self.assertRaisesRegexp(
      ValueError,
      '`Experiment` class cannot work with `tf.contrib.tpu.TPUEstimator`'):
    experiment.Experiment(
        tpu_estimator.TPUEstimator(
            model_fn=dummy_model_fn,
            config=tpu_config.RunConfig(),
            train_batch_size=256),
        train_input_fn='train_input',
        eval_input_fn='eval_input')
def main(_):
  """Run training/eval/inference."""
  cluster = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu=[FLAGS.tpu], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  my_tpu_config = tpu_config.TPUConfig(
      iterations_per_loop=FLAGS.iterations_per_loop,
      num_cores_per_replica=1,
      per_host_input_for_training=tpu_config.InputPipelineConfig.BROADCAST,
  )
  run_config = tpu_config.RunConfig(
      cluster=cluster,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=my_tpu_config)
  estimator = tpu_estimator.TPUEstimator(
      model_fn=my_model_fn,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      use_tpu=FLAGS.tpu,  # Truthy iff a TPU name was given.
      export_to_tpu=False)

  def input_fn(params):
    del params
    return transformer_dataset.get_dataset(
        FLAGS.dataset,
        FLAGS.data_dir or None,
        train=(FLAGS.mode == "train"),
        batch_size=FLAGS.batch_size,
        length=length_from_flags())

  if FLAGS.mode == "train":
    estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
  elif FLAGS.mode == "evaluate":
    estimator.evaluate(input_fn=input_fn, steps=FLAGS.eval_steps)
  elif FLAGS.mode == "infer":
    decode_from_file(estimator)
  else:
    raise ValueError(
        "unknown mode %s - must be train/evaluate/infer" % FLAGS.mode)
def main(unused_argv):
  del unused_argv  # Unused
  if FLAGS.input_layout not in ['NCHW', 'NHWC']:
    raise RuntimeError('--input_layout must be one of [NCHW, NHWC]')
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      evaluation_master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          num_shards=FLAGS.num_shards))
  inception_classifier = tpu_estimator.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      batch_axis=(get_batch_axis(FLAGS.train_batch_size // FLAGS.num_shards),
                  0))
  for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
    # tensors_to_log = {
    #     'learning_rate': 'learning_rate',
    #     'prediction_loss': 'prediction_loss',
    #     'train_accuracy': 'train_accuracy'
    # }
    # logging_hook = tf.train.LoggingTensorHook(
    #     tensors=tensors_to_log, every_n_iter=100)
    tf.logging.info('Starting training cycle %d.' % cycle)
    inception_classifier.train(
        input_fn=ImageNetInput(True), steps=FLAGS.train_steps_per_eval)
    if FLAGS.eval_enabled:
      eval_steps = (imagenet.get_split_size('validation') //
                    FLAGS.eval_batch_size)
      tf.logging.info('Starting evaluation cycle %d.' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=ImageNetInput(False), steps=eval_steps)
      tf.logging.info('Evaluation results: %s' % eval_results)
def main(unused_argv):
  tf.logging.set_verbosity(FLAGS.log)
  hparams = HParams(
      batch_size=64,
      rnn_layer_sizes=[64, 64],
      dropout_keep_prob=0.5,
      skip_first_n_losses=0,
      clip_norm=5,
      initial_learning_rate=0.01,
      decay_steps=1000,
      decay_rate=0.95)
  use_fake_data = not FLAGS.sequence_example_file
  if not use_fake_data:
    sequence_example_file_paths = tf.gfile.Glob(
        os.path.expanduser(FLAGS.sequence_example_file))
    tf.logging.info('Using real data from: %s', sequence_example_file_paths)
    input_fn = input_fn_by_record_files(
        sequence_example_file_paths,
        _INPUT_SIZE,
        padding_length=(
            FLAGS.static_padding_length if FLAGS.use_static_rnn else None))
  else:
    tf.logging.info('Using fake data')
    input_fn = input_fn_by_dataset_with_fake_data(
        _INPUT_SIZE,
        padding_length=(
            FLAGS.static_padding_length if FLAGS.use_static_rnn else None))
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  model_fn = events_rnn_graph.build_model_fn(hparams)
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      config=run_config,
      train_batch_size=hparams.batch_size,
      use_tpu=FLAGS.use_tpu)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.num_training_steps)
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards))
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def _build_estimator(self, is_training):
  """Returns an Estimator object.

  Args:
    is_training: Boolean, whether or not we're in training mode.

  Returns:
    A tf.estimator.Estimator.
  """
  config = self._config
  save_checkpoints_steps = config.logging.checkpoint.save_checkpoints_steps
  keep_checkpoint_max = self._config.logging.checkpoint.num_to_keep
  if is_training and config.use_tpu:
    iterations = config.tpu.iterations
    num_shards = config.tpu.num_shards
    run_config = tpu_config.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        master=FLAGS.master,
        evaluation_master=FLAGS.master,
        model_dir=self._logdir,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=iterations,
            num_shards=num_shards,
            per_host_input_for_training=num_shards <= 8),
        tf_random_seed=FLAGS.tf_random_seed)
    batch_size = config.data.batch_size
    return tpu_estimator.TPUEstimator(
        model_fn=self._get_model_fn(),
        config=run_config,
        use_tpu=True,
        train_batch_size=batch_size,
        eval_batch_size=batch_size)
  else:
    run_config = tf.estimator.RunConfig().replace(
        model_dir=self._logdir,
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=keep_checkpoint_max,
        tf_random_seed=FLAGS.tf_random_seed)
    return tf.estimator.Estimator(
        model_fn=self._get_model_fn(), config=run_config)
def main(argv):
  del argv  # Unused.
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations,
          num_shards=FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def main(unused_argv):
  del unused_argv  # Unused
  tf.logging.set_verbosity(tf.logging.INFO)
  if not FLAGS.save_checkpoints_secs:
    if not FLAGS.eval_steps:
      tf.logging.info(
          'If a checkpoint is expected, please set --save_checkpoints_secs.')
    else:
      tf.logging.fatal(
          'Flag --save_checkpoints_secs must be set for evaluation. Aborting.')
  if not FLAGS.train_file:
    tf.logging.fatal('Flag --train_file must be set for training. Aborting.')
  if FLAGS.eval_steps and not FLAGS.eval_file:
    tf.logging.fatal('Flag --eval_file must be set for evaluation. Aborting.')
  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True),
      tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=get_input_fn(FLAGS.train_file),
                  max_steps=FLAGS.train_steps)
  if FLAGS.eval_steps:
    estimator.evaluate(input_fn=get_input_fn(FLAGS.eval_file),
                       steps=FLAGS.eval_steps)
def testTrainingPipeline(self, training_method):
  output_directory = '/tmp/'
  g = tf.Graph()
  with g.as_default():
    dataset = self._retrieve_data(is_training=False, data_dir=False)
    FLAGS.transpose_input = False
    FLAGS.use_tpu = False
    FLAGS.mode = 'train'
    FLAGS.mask_init_method = 'random'
    FLAGS.precision = 'float32'
    FLAGS.train_steps = 1
    FLAGS.train_batch_size = 1
    FLAGS.eval_batch_size = 1
    FLAGS.steps_per_eval = 1
    FLAGS.model_architecture = 'resnet'
    params = {}
    params['output_dir'] = output_directory
    params['training_method'] = training_method
    params['use_tpu'] = False
    set_lr_schedule()
    run_config = tpu_config.RunConfig(
        master=None,
        model_dir=None,
        save_checkpoints_steps=1,
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=1, num_shards=1))
    classifier = tpu_estimator.TPUEstimator(
        use_tpu=False,
        model_fn=resnet_model_fn_w_pruning,
        params=params,
        config=run_config,
        train_batch_size=1,
        eval_batch_size=1)
    classifier.train(input_fn=dataset.input_fn, max_steps=1)