def dcgan_run_optimize():
  """Trains the TF-GAN DCGAN benchmark with TPUPoint input optimization."""
  bench_total_start = time.time()
  _NUM_VIZ_IMAGES = dcgan_main._NUM_VIZ_IMAGES
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  config = tf.compat.v1.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
          num_shards=FLAGS.num_shards,
          iterations_per_loop=FLAGS.iterations_per_loop))

  # Get the generator and discriminator functions depending on which dataset
  # we want to train on. The original mapping used the raw input functions:
  #   'mnist': (mnist_model, mnist_input), 'cifar': (cifar_model, cifar_input)
  model, dataset = {
      'mnist': (mnist_model, MNISTInputFunction),
      'cifar': (cifar_model, CIFARInputFunction),
  }[FLAGS.dataset]

  # Get the tf.Estimator `input_fn` for training and evaluation; this replaces
  # the original `functools.partial(input_fn, dataset=dataset)`.
  train_eval_input_fn = dataset(True, FLAGS.noise_dim)

  def unconditional_generator(noise, mode):
    """Generator with extra argument for tf.Estimator's `mode`."""
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    return model.generator(noise, is_training=is_training)

  def unconditional_discriminator(images, unused_conditioning):
    """Discriminator that conforms to TF-GAN API."""
    return model.discriminator(images, is_training=True)

  # TPU-based estimator used for TRAIN and EVAL.
  # TODO(joelshor): Add get_eval_metric_ops_fn.
  est = tfgan.estimator.TPUGANEstimator(
      generator_fn=unconditional_generator,
      discriminator_fn=unconditional_discriminator,
      generator_loss_fn=tfgan.losses.minimax_generator_loss,
      discriminator_loss_fn=tfgan.losses.minimax_discriminator_loss,
      generator_optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate, 0.5),
      discriminator_optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate, 0.5),
      joint_train=True,  # Train G and D jointly instead of sequentially.
      eval_on_tpu=True,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      predict_batch_size=_NUM_VIZ_IMAGES,
      use_tpu=FLAGS.use_tpu,
      config=config)

  tf.gfile.MakeDirs(os.path.join(FLAGS.model_dir, 'generated_images'))
  current_step = estimator._load_global_step_from_checkpoint_dir(FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
  tf.logging.info('Starting training for %d steps, current step: %d' %
                  (FLAGS.train_steps, current_step))

  # The original checkpoint-by-checkpoint loop (train to the next checkpoint,
  # then evaluate, while current_step < FLAGS.train_steps) is replaced by a
  # single TPUPoint-managed training run below.
  tpupoint = TPUPoint(
      estimator=est,
      gcp_project=FLAGS.gcp_project,
      tpu_zone=FLAGS.tpu_zone,
      tpu=FLAGS.tpu,
      logdir=FLAGS.model_dir,
      workers_list=None,
      num_tracing_attempts=3,
      include_dataset_ops=False,  # False for longer traces.
      monitoring_level=1,  # 1 or 2 logging level.
      num_queries=4)
  tpupoint.optimize_input_fn(train_eval_input_fn, blocking=True)

  bench_start = time.time()
  # tpupoint.Start()
  # est.train(input_fn=train_eval_input_fn, max_steps=FLAGS.train_steps)
  tpupoint.train(estimator=est, input_fn=train_eval_input_fn,
                 max_steps=FLAGS.train_steps)
  # tpupoint.Stop()
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()
def resnet_run_optimize():
  """Trains the ResNet CIFAR-10 benchmark with TPUPoint input optimization."""
  bench_total_start = time.time()
  params = params_dict.ParamsDict(
      resnet_config.RESNET_CFG, resnet_config.RESNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params = flags_to_params.override_params_from_input_flags(params, FLAGS)
  params.validate()
  params.lock()

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(5000, params.iterations_per_loop)

  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          num_shards=params.num_cores,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = tf.contrib.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      params=params.as_dict(),
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu)

  assert params.precision in ('bfloat16', 'float32'), (
      'Invalid value for precision parameter; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params.precision)
  use_bfloat16 = params.precision == 'bfloat16'

  cifar10_train, cifar10_eval = [
      cifar10_input.CIFAR10Input(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=params.transpose_input,
          cache=params.use_cache and is_training,
          image_size=params.image_size,
          num_parallel_calls=params.num_parallel_calls,
          include_background_label=(params.num_label_classes == 1001),
          use_bfloat16=use_bfloat16) for is_training in [True, False]
  ]

  steps_per_epoch = params.num_train_images // params.train_batch_size
  eval_steps = params.num_eval_images // params.eval_batch_size

  hooks = []
  if params.use_async_checkpointing:
    hooks.append(
        async_checkpoint.AsyncCheckpointSaverHook(
            checkpoint_dir=FLAGS.model_dir,
            save_steps=max(5000, params.iterations_per_loop)))
  if FLAGS.profile_every_n_steps > 0:
    hooks.append(
        tpu_profiler_hook.TPUProfilerHook(
            save_steps=FLAGS.profile_every_n_steps,
            output_dir=FLAGS.model_dir,
            tpu=FLAGS.tpu))

  tpupoint = TPUPoint(
      estimator=resnet_classifier,
      gcp_project=FLAGS.gcp_project,
      tpu_zone=FLAGS.tpu_zone,
      tpu=FLAGS.tpu,
      logdir=FLAGS.model_dir,
      workers_list=None,
      num_tracing_attempts=3,
      include_dataset_ops=False,  # False for longer traces.
      monitoring_level=1,  # 1 or 2 logging level.
      num_queries=4)
  input_fn = tpupoint.optimize_input_fn(cifar10_train.input_fn, blocking=True)

  bench_start = time.time()
  tpupoint.train(
      estimator=resnet_classifier,
      input_fn=input_fn,
      max_steps=params.train_steps,
      hooks=hooks)
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()
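# Note: `optimize_input_fn` is used in two styles in this file. The ResNet
# benchmark above captures its return value (the optimized input_fn) and
# passes that to `train`; the other benchmarks call it only for its blocking
# side effect and then hand the original input_fn to `tpupoint.train`. A
# minimal sketch of both styles, using hypothetical `raw_input_fn`, `est`,
# and `steps` names (illustrative only, not part of the benchmarks):
#
#   # Style 1: use the optimized input_fn directly (as in resnet_run_optimize).
#   opt_fn = tpupoint.optimize_input_fn(raw_input_fn, blocking=True)
#   tpupoint.train(estimator=est, input_fn=opt_fn, max_steps=steps)
#
#   # Style 2: optimize for side effects, then train with the original fn.
#   tpupoint.optimize_input_fn(raw_input_fn, blocking=True)
#   tpupoint.train(estimator=est, input_fn=raw_input_fn, max_steps=steps)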
def bert_run_optimize():
  """Fine-tunes the BERT classifier benchmark with TPUPoint optimization."""
  bench_total_start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name]()
  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  # Training is always run in this benchmark, so the original
  # `if FLAGS.do_train:` guard (and the do_train/do_eval/do_predict flag
  # validation) is dropped.
  train_examples = processor.get_train_examples(FLAGS.data_dir)
  num_train_steps = int(
      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
  num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
  file_based_convert_examples_to_features(
      train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num examples = %d", len(train_examples))
  tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
  tf.logging.info("  Num steps = %d", num_train_steps)
  train_input_fn = file_based_input_fn_builder(
      input_file=train_file,
      seq_length=FLAGS.max_seq_length,
      is_training=True,
      drop_remainder=True)

  tpupoint = TPUPoint(
      estimator=estimator,
      gcp_project=FLAGS.gcp_project,
      tpu_zone=FLAGS.tpu_zone,
      tpu=FLAGS.tpu_name,
      logdir=FLAGS.output_dir,
      workers_list=None,
      num_tracing_attempts=3,
      include_dataset_ops=False,  # False for longer traces.
      monitoring_level=1,  # 1 or 2 logging level.
      num_queries=4)
  tpupoint.optimize_input_fn(train_input_fn, blocking=True)

  bench_start = time.time()
  # tpupoint.Start()
  # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  tpupoint.train(
      estimator=estimator, input_fn=train_input_fn, max_steps=num_train_steps)
  # tpupoint.Stop()
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()
def qanet_run_optimize():
  """Trains the QANet benchmark with TPUPoint input optimization."""
  bench_total_start = time.time()
  # tf.logging.set_verbosity(tf.logging.INFO)
  cfg = create_config(model_dir=FLAGS.model_dir)

  if FLAGS.tpu:
    cfg.tpu.name = FLAGS.tpu
    cfg.tpu.zone = FLAGS.tpu_zone
    cfg.tpu.gcp_project = FLAGS.gcp_project
    cfg.tpu.enable = True
  else:
    # Toggle TPU-relevant settings.
    cfg.tpu.enable = bool(FLAGS.enable_tpu)

  # train_and_eval(cfg, do_eval=("eval" in FLAGS.mode))
  tf.logging.info("cfg.model_dir = " + cfg.model_dir)

  # Save out config to model directory.
  # assert "train" in FLAGS.mode
  tf.gfile.MakeDirs(cfg.model_dir)
  with tf.gfile.GFile(os.path.join(cfg.model_dir, "config.json"), "w") as f:
    json.dump(cfg, f)

  if not cfg.dataset.num_repeats and not cfg.steps_per_epoch:
    raise ValueError("Must have a fixed num repeats or epoch step size.")

  # Construct inputs and estimator. `new_build_dataset` replaces the original
  # `data.build_dataset(cfg.dataset, is_tpu=cfg.tpu.enable)`.
  train_input, eval_input = new_build_dataset(cfg.dataset,
                                              is_tpu=cfg.tpu.enable)
  estimator = model.get_estimator(**cfg)

  # The original per-epoch loop (estimator.train for cfg.steps_per_epoch
  # steps, followed by estimator.evaluate and an optional report_fn when
  # evaluating) is replaced by the single TPUPoint-managed run below.
  tpupoint = TPUPoint(
      estimator=estimator,
      gcp_project=FLAGS.gcp_project,
      tpu_zone=FLAGS.tpu_zone,
      tpu=FLAGS.tpu,
      logdir=FLAGS.model_dir,
      workers_list=None,
      num_tracing_attempts=3,
      include_dataset_ops=False,  # False for longer traces.
      monitoring_level=1,  # 1 or 2 logging level.
      num_queries=4)
  tpupoint.optimize_input_fn(train_input, blocking=True)

  tf.logging.info("Starting training for %s steps" %
                  (cfg.steps_per_epoch * cfg.num_epochs))
  bench_start = time.time()
  # tpupoint.Start()
  # train_metrics = estimator.train(
  #     input_fn=train_input, steps=(cfg.steps_per_epoch * cfg.num_epochs))
  tpupoint.train(
      estimator=estimator,
      input_fn=train_input,
      steps=(cfg.steps_per_epoch * cfg.num_epochs))
  # tpupoint.Stop()
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  # tf.logging.info(pprint.pformat(train_metrics))
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()
def bert_run_squad_optimize():
  """Fine-tunes the BERT SQuAD benchmark with TPUPoint input optimization."""
  bench_total_start = time.time()
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  validate_flags_or_throw(bert_config)
  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = read_squad_examples(
        input_file=FLAGS.train_file, is_training=True)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  # Training is always run in this benchmark, so the original
  # `if FLAGS.do_train:` guard is dropped here. We write to a temporary file
  # to avoid storing very large constant tensors in memory.
  train_writer = FeatureWriter(
      filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
      is_training=True)
  convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
      max_seq_length=FLAGS.max_seq_length,
      doc_stride=FLAGS.doc_stride,
      max_query_length=FLAGS.max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature)
  train_writer.close()

  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num orig examples = %d", len(train_examples))
  tf.logging.info("  Num split examples = %d", train_writer.num_features)
  tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
  tf.logging.info("  Num steps = %d", num_train_steps)
  del train_examples

  tpupoint = TPUPoint(
      estimator=estimator,
      gcp_project=FLAGS.bench_gcp_project,
      tpu_zone=FLAGS.bench_tpu_zone,
      tpu=FLAGS.tpu_name,
      logdir=FLAGS.output_dir,
      workers_list=None,
      num_tracing_attempts=3,
      include_dataset_ops=False,  # False for longer traces.
      monitoring_level=1,  # 1 or 2 logging level.
      num_queries=4)

  train_input_fn = input_fn_builder(
      input_file=train_writer.filename,
      seq_length=FLAGS.max_seq_length,
      is_training=True,
      drop_remainder=True)
  tpupoint.optimize_input_fn(train_input_fn, blocking=True)

  bench_start = time.time()
  # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
  tpupoint.train(
      estimator=estimator, input_fn=train_input_fn, max_steps=num_train_steps)
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()
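# All five benchmarks above share the same TPUPoint tail: run
# `optimize_input_fn(..., blocking=True)`, time a `tpupoint.train(...)` call,
# log the durations, and clean up. A minimal sketch of that shared sequence
# factored into a helper; the name `_run_tpupoint_benchmark` is hypothetical
# and not part of the original benchmarks, which inline this logic (the QANet
# benchmark also uses `steps=` rather than `max_steps=`, which this sketch
# does not cover):
def _run_tpupoint_benchmark(tpupoint, estimator, input_fn, max_steps,
                            bench_total_start, hooks=None):
  """Optimizes `input_fn`, times the TPUPoint-managed run, and cleans up."""
  tpupoint.optimize_input_fn(input_fn, blocking=True)
  bench_start = time.time()
  # Only the ResNet benchmark passes hooks, so forward them conditionally.
  train_kwargs = {"hooks": hooks} if hooks else {}
  tpupoint.train(estimator=estimator, input_fn=input_fn,
                 max_steps=max_steps, **train_kwargs)
  bench_elapsed = time.time() - bench_start
  bench_total_dur = time.time() - bench_total_start
  tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
  tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
  tpupoint.CleanUp()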