def run_wide_deep(flags_obj): """Run Wide-Deep training and eval loop. Args: flags_obj: An object containing parsed flag values. """ # Clean up the model directory if present shutil.rmtree(flags_obj.model_dir, ignore_errors=True) model = build_estimator(flags_obj.model_dir, flags_obj.model_type) train_file = os.path.join(flags_obj.data_dir, 'adult.data') test_file = os.path.join(flags_obj.data_dir, 'adult.test') # Train and evaluate the model every `flags.epochs_between_evals` epochs. def train_input_fn(): return input_fn( train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size) def eval_input_fn(): return input_fn(test_file, 1, False, flags_obj.batch_size) run_params = { 'batch_size': flags_obj.batch_size, 'train_epochs': flags_obj.train_epochs, 'model_type': flags_obj.model_type, } benchmark_logger = logger.config_benchmark_logger(flags_obj) benchmark_logger.log_run_info('wide_deep', 'Census Income', run_params) loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, batch_size=flags_obj.batch_size, tensors_to_log={'average_loss': loss_prefix + 'head/truediv', 'loss': loss_prefix + 'head/weighted_loss/Sum'}) # Train and evaluate the model every `flags.epochs_between_evals` epochs. for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): model.train(input_fn=train_input_fn, hooks=train_hooks) results = model.evaluate(input_fn=eval_input_fn) # Display evaluation metrics tf.logging.info('Results at epoch %d / %d', (n + 1) * flags_obj.epochs_between_evals, flags_obj.train_epochs) tf.logging.info('-' * 60) for key in sorted(results): tf.logging.info('%s: %s' % (key, results[key])) benchmark_logger.log_evaluation_result(results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, results['accuracy']): break # Export the model if flags_obj.export_dir is not None: export_model(model, flags_obj.model_type, flags_obj.export_dir)
def validate_train_hook_name(self,
                             test_hook_name,
                             expected_hook_name,
                             **kwargs):
  returned_hook = hooks_helper.get_train_hooks([test_hook_name], **kwargs)
  self.assertEqual(len(returned_hook), 1)
  self.assertIsInstance(returned_hook[0], tf.train.SessionRunHook)
  self.assertEqual(returned_hook[0].__class__.__name__.lower(),
                   expected_hook_name)
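# A hedged example of how the helper above might be exercised in a test case.
# The hook name, the lowercased class name, and the kwargs are illustrative
# only and are not taken from the original test suite:
def test_logging_tensor_hook(self):
  self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook',
                                tensors_to_log={'loss': 'loss'})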
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn, build_estimator_fn, flags_obj, tensors_to_log, early_stop=False): """Define training loop.""" model_helpers.apply_clean(flags.FLAGS) model = build_estimator_fn( model_dir=flags_obj.model_dir, model_type=flags_obj.model_type, model_column_fn=model_column_fn, inter_op=flags_obj.inter_op_parallelism_threads, intra_op=flags_obj.intra_op_parallelism_threads) run_params = { 'batch_size': flags_obj.batch_size, 'train_epochs': flags_obj.train_epochs, 'model_type': flags_obj.model_type, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('wide_deep', name, run_params, test_id=flags_obj.benchmark_test_id) loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') tensors_to_log = {k: v.format(loss_prefix=loss_prefix) for k, v in tensors_to_log.items()} train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log) # Train and evaluate the model every `flags.epochs_between_evals` epochs. for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): model.train(input_fn=train_input_fn, hooks=train_hooks) results = model.evaluate(input_fn=eval_input_fn) # Display evaluation metrics tf.logging.info('Results at epoch %d / %d', (n + 1) * flags_obj.epochs_between_evals, flags_obj.train_epochs) tf.logging.info('-' * 60) for key in sorted(results): tf.logging.info('%s: %s' % (key, results[key])) benchmark_logger.log_evaluation_result(results) if early_stop and model_helpers.past_stop_threshold( flags_obj.stop_threshold, results['accuracy']): break # Export the model if flags_obj.export_dir is not None: export_model(model, flags_obj.model_type, flags_obj.export_dir, model_column_fn)
def main(argv): parser = WideDeepArgParser() flags = parser.parse_args(args=argv[1:]) # Clean up the model directory if present shutil.rmtree(flags.model_dir, ignore_errors=True) model = build_estimator(flags.model_dir, flags.model_type) train_file = os.path.join(flags.data_dir, 'adult.data') test_file = os.path.join(flags.data_dir, 'adult.test') # Train and evaluate the model every `flags.epochs_between_evals` epochs. def train_input_fn(): return input_fn( train_file, flags.epochs_between_evals, True, flags.batch_size) def eval_input_fn(): return input_fn(test_file, 1, False, flags.batch_size) loss_prefix = LOSS_PREFIX.get(flags.model_type, '') train_hooks = hooks_helper.get_train_hooks( flags.hooks, batch_size=flags.batch_size, tensors_to_log={'average_loss': loss_prefix + 'head/truediv', 'loss': loss_prefix + 'head/weighted_loss/Sum'}) # Train and evaluate the model every `flags.epochs_between_evals` epochs. for n in range(flags.train_epochs // flags.epochs_between_evals): model.train(input_fn=train_input_fn, hooks=train_hooks) results = model.evaluate(input_fn=eval_input_fn) # Display evaluation metrics print('Results at epoch', (n + 1) * flags.epochs_between_evals) print('-' * 60) for key in sorted(results): print('%s: %s' % (key, results[key])) if model_helpers.past_stop_threshold( flags.stop_threshold, results['accuracy']): break
def log_and_get_hooks(eval_batch_size):
  """Convenience function for hook and logger creation."""
  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)
  return benchmark_logger, train_hooks
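# Hedged usage sketch for the helper above. It assumes the same module-level
# FLAGS object as the surrounding code; `run_training` and its arguments are
# illustrative names, not part of the original module:
def run_training(estimator, train_input_fn, eval_input_fn):
  benchmark_logger, train_hooks = log_and_get_hooks(
      eval_batch_size=FLAGS.eval_batch_size or FLAGS.batch_size)
  estimator.train(input_fn=train_input_fn, hooks=train_hooks)
  eval_results = estimator.evaluate(input_fn=eval_input_fn)
  benchmark_logger.log_evaluation_result(eval_results)
  return eval_results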
def run_transformer(flags_obj): """Create tf.Estimator to train and evaluate transformer model. Args: flags_obj: Object containing parsed flag values. """ num_gpus = flags_core.get_num_gpus(flags_obj) # Add flag-defined parameters to params object params = PARAMS_MAP[flags_obj.param_set] if num_gpus > 1: if flags_obj.param_set == "big": params = model_params.BIG_MULTI_GPU_PARAMS elif flags_obj.param_set == "base": params = model_params.BASE_MULTI_GPU_PARAMS params["data_dir"] = flags_obj.data_dir params["model_dir"] = flags_obj.model_dir params["num_parallel_calls"] = flags_obj.num_parallel_calls params["tpu"] = flags_obj.tpu params["use_tpu"] = bool(flags_obj.tpu) # was a tpu specified. params["static_batch"] = flags_obj.static_batch or params["use_tpu"] params["allow_ffn_pad"] = not params["use_tpu"] params["use_synthetic_data"] = flags_obj.use_synthetic_data # Set batch size parameter, which depends on the availability of # TPU and GPU, and distribution settings. params["batch_size"] = (flags_obj.batch_size or ( params["default_batch_size_tpu"] if params["use_tpu"] else params["default_batch_size"])) if not params["use_tpu"]: params["batch_size"] = distribution_utils.per_device_batch_size( params["batch_size"], num_gpus) schedule_manager = schedule.Manager( train_steps=flags_obj.train_steps, steps_between_evals=flags_obj.steps_between_evals, train_epochs=flags_obj.train_epochs, epochs_between_evals=flags_obj.epochs_between_evals, default_train_epochs=DEFAULT_TRAIN_EPOCHS, batch_size=params["batch_size"], max_length=params["max_length"], use_tpu=params["use_tpu"], num_tpu_shards=flags_obj.num_tpu_shards ) params["repeat_dataset"] = schedule_manager.repeat_dataset model_helpers.apply_clean(flags.FLAGS) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, tensors_to_log=TENSORS_TO_LOG, # used for logging hooks batch_size=schedule_manager.batch_size, # for ExamplesPerSecondHook use_tpu=params["use_tpu"] # Not all hooks can run with TPUs ) benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info( model_name="transformer", dataset_name="wmt_translate_ende", run_params=params, test_id=flags_obj.benchmark_test_id) # Train and evaluate transformer model estimator = construct_estimator(flags_obj, params, schedule_manager) run_loop( estimator=estimator, # Training arguments schedule_manager=schedule_manager, train_hooks=train_hooks, benchmark_logger=benchmark_logger, # BLEU calculation arguments bleu_source=flags_obj.bleu_source, bleu_ref=flags_obj.bleu_ref, bleu_threshold=flags_obj.stop_threshold, vocab_file=flags_obj.vocab_file) if flags_obj.export_dir and not params["use_tpu"]: serving_input_fn = export.build_tensor_serving_input_receiver_fn( shape=[None], dtype=tf.int64, batch_size=None) # Export saved model, and save the vocab file as an extra asset. The vocab # file is saved to allow consistent input encoding and output decoding. # (See the "Export trained model" section in the README for an example of # how to use the vocab file.) # Since the model itself does not use the vocab file, this file is saved as # an extra asset rather than a core asset. estimator.export_savedmodel( flags_obj.export_dir, serving_input_fn, assets_extra={"vocab.txt": flags_obj.vocab_file}, strip_default_attrs=True)
def test_raise_in_non_list_names(self):
  with self.assertRaises(ValueError):
    hooks_helper.get_train_hooks(
        'LoggingTensorHook, ProfilerHook', model_dir="", batch_size=256)
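# The test above documents that hook names must be passed as a list of
# strings, not as one comma-separated string. A minimal sketch of the correct
# call shape, with illustrative values for the keyword arguments:
hooks = hooks_helper.get_train_hooks(
    ['LoggingTensorHook', 'ProfilerHook'],  # a list of names, not one string
    model_dir="/tmp/model",                 # hypothetical directory
    batch_size=256)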
def run(_): tf.compat.v1.set_random_seed(flags_obj.seed) print("Data pre-processing...") train_speech_dataset = generate_dataset(flags_obj.train_data_dir) eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir) num_classes = len(train_speech_dataset.speech_labels) num_gpus = flags_core.get_num_gpus(flags_obj) distribution_strategy = distribution_utils.get_distribution_strategy( num_gpus=num_gpus) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy) estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=flags_obj.model_dir, config=run_config, params={ "num_classes": num_classes, }) run_params = { "batch_size": flags_obj.batch_size, "train_epochs": flags_obj.train_epochs, "rnn_hidden_size": flags_obj.rnn_hidden_size, "rnn_hidden_layers": flags_obj.rnn_hidden_layers, "rnn_type": flags_obj.rnn_type, "is_bidirectional": flags_obj.is_bidirectional, "use_bias": flags_obj.use_bias } dataset_name = "LibriSpeech" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info("deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) per_replica_batch_size = distribution_utils.per_replica_batch_size( flags_obj.batch_size, num_gpus) def input_fn_train(): return input_fn(per_replica_batch_size, train_speech_dataset) def input_fn_eval(): return input_fn(per_replica_batch_size, eval_speech_dataset) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): print('Starting a training cycle: %d/%d' % (cycle_index + 1, total_training_cycle)) train_speech_dataset.entries = fn.batch_wise_dataset_shuffle( train_speech_dataset.entries, cycle_index, flags_obj.sortagrad, flags_obj.batch_size) estimator.train(input_fn=input_fn_train, hooks=train_hooks) eval_results = evaluate_model(estimator, eval_speech_dataset.speech_labels, eval_speech_dataset.entries, input_fn_eval) print("Iteration {}: WER = {:.2f}, CER = {:.2f}".format( cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY])) if model_helpers.past_stop_threshold(flags_obj.wer_threshold, eval_results[_WER_KEY]): break
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_function = model_fn if flags_obj.multi_gpu: validate_batch_size_for_multi_gpu(flags_obj.batch_size) # There are two steps required if using multi-GPU: (1) wrap the model_fn, # and (2) wrap the optimizer. The first happens here, and (2) happens # in the model_fn itself when the optimizer is defined. model_function = tf.contrib.estimator.replicate_model_fn( model_fn, loss_reduction=tf.losses.Reduction.MEAN) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, params={ 'data_format': data_format, 'multi_gpu': flags_obj.multi_gpu }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn)
def run_ncf(_): """Run NCF training and eval loop.""" if FLAGS.download_if_missing: movielens.download(FLAGS.dataset, FLAGS.data_dir) movielens_dataset.construct_train_eval_csv( data_dir=FLAGS.data_dir, dataset=FLAGS.dataset) tf.logging.info("Data preprocessing...") ncf_dataset = movielens_dataset.data_preprocessing( FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg) model_helpers.apply_clean(flags.FLAGS) # Create NeuMF model and convert it to Estimator tf.logging.info("Creating Estimator from Keras model...") layers = [int(layer) for layer in FLAGS.layers] mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization] keras_model = neumf_model.NeuMF( ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors, layers, FLAGS.batch_size, FLAGS.mf_regularization, mlp_regularization) num_gpus = flags_core.get_num_gpus(FLAGS) estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, model_dir=FLAGS.model_dir, batch_size=FLAGS.batch_size # for ExamplesPerSecondHook ) run_params = { "batch_size": FLAGS.batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info( model_name="recommendation", dataset_name=FLAGS.dataset, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Training and evaluation cycle def get_train_input_fn(): return movielens_dataset.get_input_fn( True, distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.data_dir, FLAGS.dataset, FLAGS.epochs_between_evals) def get_pred_input_fn(): return movielens_dataset.get_input_fn( False, distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) # Train the model estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks) # Evaluate the model eval_results = evaluate_model( estimator, FLAGS.batch_size, num_gpus, ncf_dataset, get_pred_input_fn()) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. hr = eval_results[_HR_KEY] ndcg = eval_results[_NDCG_KEY] tf.logging.info( "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): break # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(
        np.ceil(FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
                (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(
        np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  eval_input_fn = None
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_ACCURACY,
        value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
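# Several entries above gate early stopping on
# `model_helpers.past_stop_threshold`. The official implementation is not
# reproduced here; a minimal sketch of the behaviour these call sites appear
# to rely on (stop once a higher-is-better metric such as HR or accuracy
# reaches the threshold, with a `None` threshold disabling early stopping):
def past_stop_threshold_sketch(stop_threshold, eval_metric):
  """Return True when `eval_metric` has reached `stop_threshold`, if any."""
  if stop_threshold is None:
    return False
  return float(eval_metric) >= float(stop_threshold)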
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_core.get_num_gpus(flags_obj), all_reduce_alg=flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator(model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'data_format': data_format, }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True)
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    approx_train_steps = None
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    approx_train_steps = int(ncf_dataset.num_train_positives *
                             (1 + FLAGS.num_neg) // FLAGS.batch_size)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "eval_batch_size": eval_batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": num_users,
          "num_items": num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)
    if approx_train_steps and np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    if train_record_dir:
      tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    eval_results = eval_estimator.evaluate(pred_input_fn)
    tf.logging.info("Evaluation complete.")

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[rconst.HR_KEY]
    ndcg = eval_results[rconst.NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
def run_deep_speech(_): """Run deep speech training and eval loop.""" tf.set_random_seed(flags_obj.seed) # Data preprocessing tf.logging.info("Data preprocessing...") train_speech_dataset = generate_dataset(flags_obj.train_data_dir) eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir) # Number of label classes. Label string is "[a-z]' -" num_classes = len(train_speech_dataset.speech_labels) # Use distribution strategy for multi-gpu training num_gpus = flags_core.get_num_gpus(flags_obj) distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus=num_gpus) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy) estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=flags_obj.model_dir, config=run_config, params={ "num_classes": num_classes, } ) # Benchmark logging run_params = { "batch_size": flags_obj.batch_size, "train_epochs": flags_obj.train_epochs, "rnn_hidden_size": flags_obj.rnn_hidden_size, "rnn_hidden_layers": flags_obj.rnn_hidden_layers, "rnn_type": flags_obj.rnn_type, "is_bidirectional": flags_obj.is_bidirectional, "use_bias": flags_obj.use_bias } dataset_name = "LibriSpeech" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info("deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) per_replica_batch_size = per_device_batch_size(flags_obj.batch_size, num_gpus) def input_fn_train(): return dataset.input_fn( per_replica_batch_size, train_speech_dataset) def input_fn_eval(): return dataset.input_fn( per_replica_batch_size, eval_speech_dataset) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle) # Perform batch_wise dataset shuffling train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle( train_speech_dataset.entries, cycle_index, flags_obj.sortagrad, flags_obj.batch_size) estimator.train(input_fn=input_fn_train, hooks=train_hooks) # Evaluation tf.logging.info("Starting to evaluate...") eval_results = evaluate_model( estimator, eval_speech_dataset.speech_labels, eval_speech_dataset.entries, input_fn_eval) # Log the WER and CER results. benchmark_logger.log_evaluation_result(eval_results) tf.logging.info( "Iteration {}: WER = {:.2f}, CER = {:.2f}".format( cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY])) # If some evaluation threshold is met if model_helpers.past_stop_threshold( flags_obj.wer_threshold, eval_results[_WER_KEY]): break
def resnet_main(seed, flags, model_function, input_function, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags.export_dir is passed.
  """
  mlperf_log.resnet_print(key=mlperf_log.RUN_START)

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags.num_gpus == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags.num_gpus == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags.num_gpus)

  mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config,
                                      tf_random_seed=seed)

  mlperf_log.resnet_print(key=mlperf_log.INPUT_BATCH_SIZE,
                          value=flags.batch_size)
  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags.model_dir, config=run_config,
      params={
          'resnet_size': flags.resnet_size,
          'data_format': flags.data_format,
          'batch_size': flags.batch_size,
          'version': flags.version,
          'loss_scale': flags.loss_scale,
          'dtype': flags.dtype
      })

  if flags.benchmark_log_dir is not None:
    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')
  else:
    benchmark_logger = None

  mlperf_log.resnet_print(key=mlperf_log.TRAIN_LOOP)

  # The reference performs the first evaluation on the fourth epoch. (offset
  # eval by 3 epochs)
  mlperf_log.resnet_print(key=mlperf_log.EVAL_EPOCH_OFFSET, value=3)
  success = False
  for i in range(flags.train_epochs // flags.epochs_between_evals):
    # Data for epochs_between_evals (i.e. 4 epochs between evals) worth of
    # epochs is concatenated and run as a single block inside a session. For
    # this reason we declare all of the epochs that will be run at the start.
    # Submitters may report in a way which is reasonable for their control flow.
    for j in range(flags.epochs_between_evals):
      mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH,
                              value=i * flags.epochs_between_evals + j)

    train_hooks = hooks_helper.get_train_hooks(
        flags.hooks,
        batch_size=flags.batch_size,
        benchmark_log_dir=flags.benchmark_log_dir)

    _log_cache = []
    def formatter(x):
      """Abuse side effects to get tensors out of the model_fn."""
      if _log_cache:
        _log_cache.pop()
      _log_cache.append(x.copy())
      return str(x)

    compliance_hook = tf.train.LoggingTensorHook(
        tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
        every_n_iter=int(1e10),
        at_end=True,
        formatter=formatter)

    print('Starting a training cycle.')

    def input_fn_train():
      return input_function(
          is_training=True,
          data_dir=flags.data_dir,
          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
          num_epochs=flags.epochs_between_evals,
          num_gpus=flags.num_gpus,
          dtype=flags.dtype)

    classifier.train(input_fn=input_fn_train,
                     hooks=train_hooks + [compliance_hook],
                     max_steps=flags.max_train_steps)

    train_examples = int(_log_cache.pop()[_NUM_EXAMPLES_NAME])
    mlperf_log.resnet_print(key=mlperf_log.INPUT_SIZE, value=train_examples)

    print('Starting to evaluate.')

    # Evaluate the model and print results
    def input_fn_eval():
      return input_function(
          is_training=False,
          data_dir=flags.data_dir,
          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
          num_epochs=1,
          dtype=flags.dtype)

    mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
    # flags.max_train_steps is generally associated with testing and profiling.
    # As a result it is frequently called with synthetic data, which will
    # iterate forever. Passing steps=flags.max_train_steps allows the eval
    # (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags.max_train_steps)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_SIZE,
                            value=int(eval_results[_NUM_EXAMPLES_NAME]))
    mlperf_log.resnet_print(key=mlperf_log.EVAL_ACCURACY,
                            value=float(eval_results['accuracy']))
    mlperf_log.resnet_print(key=mlperf_log.EVAL_TARGET,
                            value=flags.stop_threshold)
    print(eval_results)

    if benchmark_logger:
      benchmark_logger.log_estimator_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(flags.stop_threshold,
                                         eval_results['accuracy']):
      success = True
      break

  mlperf_log.resnet_print(key=mlperf_log.RUN_STOP, value={"success": success})
  mlperf_log.resnet_print(key=mlperf_log.RUN_FINAL)
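# Many of the `get_train_hooks` calls above pass `batch_size` "for
# ExamplesPerSecondHook". The official hook is not reproduced here; the
# following is only a minimal sketch of the idea (time the training run calls
# and convert the step rate into examples per second). The class name is
# hypothetical:
import time
import tensorflow as tf

class ExamplesPerSecondSketchHook(tf.train.SessionRunHook):
  """Logs approximate examples/sec every `every_n_steps` training steps."""

  def __init__(self, batch_size, every_n_steps=100):
    self._batch_size = batch_size
    self._every_n_steps = every_n_steps
    self._step = 0
    self._start = None

  def before_run(self, run_context):
    if self._start is None:
      self._start = time.time()

  def after_run(self, run_context, run_values):
    self._step += 1
    if self._step % self._every_n_steps == 0:
      elapsed = time.time() - self._start
      tf.logging.info("examples/sec: %.1f",
                      self._every_n_steps * self._batch_size / elapsed)
      self._start = time.time()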
def convinh_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for convinh Models.

  Args:
    flags_obj: An object containing parsed flags. See define_convinh_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  run_config = tf.estimator.RunConfig(
      tf_random_seed=flags_obj.seed,
      train_distribute=distribution_strategy,
      session_config=session_config,
      keep_checkpoint_max=flags_obj.num_ckpt)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'model_params': {
              'data_format': flags_obj.data_format,
              'filters': list(map(int, flags_obj.filters)),
              'ratio_PV': flags_obj.ratio_PV,
              'ratio_SST': flags_obj.ratio_SST,
              'conv_kernel_size': list(map(int, flags_obj.conv_kernel_size)),
              'conv_kernel_size_inh':
                  list(map(int, flags_obj.conv_kernel_size_inh)),
              'conv_strides': list(map(int, flags_obj.conv_strides)),
              'pool_size': list(map(int, flags_obj.pool_size)),
              'pool_strides': list(map(int, flags_obj.pool_strides)),
              'num_ff_layers': flags_obj.num_ff_layers,
              'num_rnn_layers': flags_obj.num_rnn_layers,
              'connection': flags_obj.connection,
              'n_time': flags_obj.n_time,
              'cell_fn': flags_obj.cell_fn,
              'act_fn': flags_obj.act_fn,
              'pvsst_circuit': flags_obj.pvsst_circuit,
              'gating': flags_obj.gating,
              'normalize': flags_obj.normalize,
              'num_classes': flags_obj.num_classes
          },
          'batch_size': flags_obj.batch_size,
          'weight_decay': flags_obj.weight_decay,
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'convinh_size': flags_obj.convinh_size,  # deprecated
      'convinh_version': flags_obj.convinh_version,  # deprecated
      'synthetic_data': flags_obj.use_synthetic_data,  # deprecated
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('convinh', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  class input_fn_train(object):

    def __init__(self, num_epochs):
      self._num_epochs = num_epochs

    def __call__(self):
      return input_function(
          is_training=True,
          data_dir=flags_obj.data_dir,
          batch_size=distribution_utils.per_device_batch_size(
              flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
          num_epochs=self._num_epochs,
          num_gpus=flags_core.get_num_gpus(flags_obj))

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  tf.logging.info('Evaluate the initial model.')
  eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                     steps=flags_obj.max_train_steps)
  benchmark_logger.log_evaluation_result(eval_results)

  # training
  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals) + 1
  for cycle_index in range(total_training_cycle):
    cur_train_epochs = flags_obj.epochs_between_evals if cycle_index else 1
    tf.logging.info('Starting a training cycle: %d/%d, with %d epochs',
                    cycle_index, total_training_cycle, cur_train_epochs)
    classifier.train(input_fn=input_fn_train(cur_train_epochs),
                     hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data, which
    # will iterate forever. Passing steps=flags_obj.max_train_steps allows the
    # eval (which is generally unimportant in those circumstances) to
    # terminate. Note that eval will run for max_train_steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

    if flags_obj.export_dir is not None:
      # Exports a saved model for the given classifier.
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=1)
      if cycle_index == 0:
        classifier.export_savedmodel(
            flags_obj.export_dir, input_receiver_fn,
            checkpoint_path='{}/model.ckpt-0'.format(flags_obj.model_dir))
      classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None, num_images=None, zeroshot_eval=False):
  model_helpers.apply_clean(flags.FLAGS)

  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = config_utils.get_session_config(flags_obj)
  run_config = config_utils.get_run_config(flags_obj, flags_core,
                                           session_config, num_images['train'])

  def gen_estimator(period=None):
    resnet_size = int(flags_obj.resnet_size)
    data_format = flags_obj.data_format
    batch_size = flags_obj.batch_size
    resnet_version = int(flags_obj.resnet_version)
    loss_scale = flags_core.get_loss_scale(flags_obj)
    dtype_tf = flags_core.get_tf_dtype(flags_obj)
    num_epochs_per_decay = flags_obj.num_epochs_per_decay
    learning_rate_decay_factor = flags_obj.learning_rate_decay_factor
    end_learning_rate = flags_obj.end_learning_rate
    learning_rate_decay_type = flags_obj.learning_rate_decay_type
    weight_decay = flags_obj.weight_decay
    zero_gamma = flags_obj.zero_gamma
    lr_warmup_epochs = flags_obj.lr_warmup_epochs
    base_learning_rate = flags_obj.base_learning_rate
    use_resnet_d = flags_obj.use_resnet_d
    use_dropblock = flags_obj.use_dropblock
    dropblock_kp = [float(be) for be in flags_obj.dropblock_kp]
    label_smoothing = flags_obj.label_smoothing
    momentum = flags_obj.momentum
    bn_momentum = flags_obj.bn_momentum
    train_epochs = flags_obj.train_epochs
    piecewise_lr_boundary_epochs = [
        int(be) for be in flags_obj.piecewise_lr_boundary_epochs
    ]
    piecewise_lr_decay_rates = [
        float(dr) for dr in flags_obj.piecewise_lr_decay_rates
    ]
    use_ranking_loss = flags_obj.use_ranking_loss
    use_se_block = flags_obj.use_se_block
    use_sk_block = flags_obj.use_sk_block
    mixup_type = flags_obj.mixup_type
    dataset_name = flags_obj.dataset_name
    kd_temp = flags_obj.kd_temp
    no_downsample = flags_obj.no_downsample
    anti_alias_filter_size = flags_obj.anti_alias_filter_size
    anti_alias_type = flags_obj.anti_alias_type
    cls_loss_type = flags_obj.cls_loss_type
    logit_type = flags_obj.logit_type
    embedding_size = flags_obj.embedding_size
    pool_type = flags_obj.pool_type
    arc_s = flags_obj.arc_s
    arc_m = flags_obj.arc_m
    bl_alpha = flags_obj.bl_alpha
    bl_beta = flags_obj.bl_beta
    exp = None

    if install_hyperdash and flags_obj.use_hyperdash:
      exp = Experiment(flags_obj.model_dir.split("/")[-1])
      resnet_size = exp.param("resnet_size", int(flags_obj.resnet_size))
      batch_size = exp.param("batch_size", flags_obj.batch_size)
      exp.param("dtype", flags_obj.dtype)
      learning_rate_decay_type = exp.param(
          "learning_rate_decay_type", flags_obj.learning_rate_decay_type)
      weight_decay = exp.param("weight_decay", flags_obj.weight_decay)
      zero_gamma = exp.param("zero_gamma", flags_obj.zero_gamma)
      lr_warmup_epochs = exp.param("lr_warmup_epochs",
                                   flags_obj.lr_warmup_epochs)
      base_learning_rate = exp.param("base_learning_rate",
                                     flags_obj.base_learning_rate)
      use_dropblock = exp.param("use_dropblock", flags_obj.use_dropblock)
      dropblock_kp = exp.param(
          "dropblock_kp", [float(be) for be in flags_obj.dropblock_kp])
      piecewise_lr_boundary_epochs = exp.param(
          "piecewise_lr_boundary_epochs",
          [int(be) for be in flags_obj.piecewise_lr_boundary_epochs])
      piecewise_lr_decay_rates = exp.param(
          "piecewise_lr_decay_rates",
          [float(dr) for dr in flags_obj.piecewise_lr_decay_rates])
      mixup_type = exp.param("mixup_type", flags_obj.mixup_type)
      dataset_name = exp.param("dataset_name", flags_obj.dataset_name)
      exp.param("autoaugment_type", flags_obj.autoaugment_type)

    classifier = tf.estimator.Estimator(
        model_fn=model_function, model_dir=flags_obj.model_dir,
        config=run_config,
        params={
            'resnet_size': resnet_size,
            'data_format': data_format,
            'batch_size': batch_size,
            'resnet_version': resnet_version,
            'loss_scale': loss_scale,
            'dtype': dtype_tf,
            'num_epochs_per_decay': num_epochs_per_decay,
            'learning_rate_decay_factor': learning_rate_decay_factor,
            'end_learning_rate': end_learning_rate,
            'learning_rate_decay_type': learning_rate_decay_type,
            'weight_decay': weight_decay,
            'zero_gamma': zero_gamma,
            'lr_warmup_epochs': lr_warmup_epochs,
            'base_learning_rate': base_learning_rate,
            'use_resnet_d': use_resnet_d,
            'use_dropblock': use_dropblock,
            'dropblock_kp': dropblock_kp,
            'label_smoothing': label_smoothing,
            'momentum': momentum,
            'bn_momentum': bn_momentum,
            'embedding_size': embedding_size,
            'train_epochs': train_epochs,
            'piecewise_lr_boundary_epochs': piecewise_lr_boundary_epochs,
            'piecewise_lr_decay_rates': piecewise_lr_decay_rates,
            'with_drawing_bbox': flags_obj.with_drawing_bbox,
            'use_ranking_loss': use_ranking_loss,
            'use_se_block': use_se_block,
            'use_sk_block': use_sk_block,
            'mixup_type': mixup_type,
            'kd_temp': kd_temp,
            'no_downsample': no_downsample,
            'dataset_name': dataset_name,
            'anti_alias_filter_size': anti_alias_filter_size,
            'anti_alias_type': anti_alias_type,
            'cls_loss_type': cls_loss_type,
            'logit_type': logit_type,
            'arc_s': arc_s,
            'arc_m': arc_m,
            'pool_type': pool_type,
            'bl_alpha': bl_alpha,
            'bl_beta': bl_beta,
            'train_steps': total_train_steps,
        })
    return classifier, exp

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                             model_dir=flags_obj.model_dir,
                                             batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs):
    return input_function(is_training=True,
                          use_random_crop=flags_obj.training_random_crop,
                          num_epochs=num_epochs,
                          flags_obj=flags_obj)

  def input_fn_eval():
    return input_function(is_training=False,
                          use_random_crop=False,
                          num_epochs=1,
                          flags_obj=flags_obj)

  ckpt_keeper = checkpoint_utils.CheckpointKeeper(
      save_dir=flags_obj.model_dir,
      num_to_keep=flags_obj.num_best_ckpt_to_keep,
      keep_epoch=flags_obj.keep_ckpt_every_eval,
      maximize=True)

  if zeroshot_eval:
    dataset = data_config.get_config(dataset_name)
    model = model_fns.Model(
        int(flags_obj.resnet_size),
        flags_obj.data_format,
        resnet_version=int(flags_obj.resnet_version),
        num_classes=dataset.num_classes,
        zero_gamma=flags_obj.zero_gamma,
        use_se_block=flags_obj.use_se_block,
        use_sk_block=flags_obj.use_sk_block,
        no_downsample=flags_obj.no_downsample,
        anti_alias_filter_size=flags_obj.anti_alias_filter_size,
        anti_alias_type=flags_obj.anti_alias_type,
        bn_momentum=flags_obj.bn_momentum,
        embedding_size=flags_obj.embedding_size,
        pool_type=flags_obj.pool_type,
        bl_alpha=flags_obj.bl_alpha,
        bl_beta=flags_obj.bl_beta,
        dtype=flags_core.get_tf_dtype(flags_obj),
        loss_type=flags_obj.cls_loss_type)

  def train_and_evaluate(hooks):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=hooks,
                       steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    if zeroshot_eval:
      tf.reset_default_graph()
      eval_results = recall_metric.recall_at_k(
          flags_obj, flags_core, input_fns.input_fn_ir_eval, model,
          num_images['validation'],
          eval_similarity=flags_obj.eval_similarity,
          return_embedding=True)
    else:
      eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                         steps=flags_obj.max_train_steps)

    return eval_results

  total_train_steps = flags_obj.train_epochs * int(
      num_images['train'] / flags_obj.batch_size)

  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  elif flags_obj.export_only:
    schedule, n_loops = [], 0
  else:
    n_loops = math.ceil(flags_obj.train_epochs /
                        flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # over counting.

    schedule = config_utils.get_epoch_schedule(flags_obj, schedule, num_images)
    tf.logging.info('epoch schedule:')
    tf.logging.info(schedule)

  classifier, exp = gen_estimator()
  if flags_obj.pretrained_model_checkpoint_path:
    warm_start_hook = WarmStartHook(flags_obj.pretrained_model_checkpoint_path)
    train_hooks.append(warm_start_hook)

  for cycle_index, num_train_epochs in enumerate(schedule):
    eval_results = train_and_evaluate(train_hooks)
    if zeroshot_eval:
      metric = eval_results['recall_at_1']
    else:
      metric = eval_results['accuracy']
    ckpt_keeper.save(metric, flags_obj.model_dir)
    if exp:
      exp.metric("accuracy", metric)
    benchmark_logger.log_evaluation_result(eval_results)
    if model_helpers.past_stop_threshold(flags_obj.stop_threshold, metric):
      break
    if model_helpers.past_stop_threshold(total_train_steps,
                                         eval_results['global_step']):
      break

  if exp:
    exp.end()

  if flags_obj.export_dir is not None:
    export_utils.export_pb(flags_core, flags_obj, shape, classifier)
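# A worked example of the evaluation schedule computed above, using
# illustrative flag values (train_epochs=90, epochs_between_evals=4):
import math

train_epochs, epochs_between_evals = 90, 4
n_loops = math.ceil(train_epochs / epochs_between_evals)        # 23 cycles
schedule = [epochs_between_evals for _ in range(int(n_loops))]  # [4] * 23
schedule[-1] = train_epochs - sum(schedule[:-1])                # last cycle: 2
assert sum(schedule) == train_epochs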
def main(_): """Train NCF model and evaluate its hit rate (HR) metric.""" params = create_params() if FLAGS.seed is not None: np.random.seed(FLAGS.seed) if FLAGS.use_synthetic_data: producer = data_pipeline.DummyConstructor() num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[ FLAGS.dataset] num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH else: num_users, num_items, producer = data_preprocessing.instantiate_pipeline( dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, epoch_dir=os.path.join(params["model_dir"], "epoch"), params=get_params_for_dataset(params), constructor_type=FLAGS.constructor_type, deterministic=FLAGS.seed is not None) num_train_steps = (producer.train_batches_per_epoch // params["batches_per_step"]) num_eval_steps = (producer.eval_batches_per_epoch // params["batches_per_step"]) assert not producer.train_batches_per_epoch % params["batches_per_step"] assert not producer.eval_batches_per_epoch % params["batches_per_step"] producer.start() params["num_users"] = num_users params["num_items"] = num_items feature_columns = create_feature_columns(params) model_fn = create_model_fn(feature_columns) estimator = create_tpu_estimator(model_fn, feature_columns, params) train_hooks = hooks_helper.get_train_hooks( ["ProfilerHook"], model_dir=FLAGS.model_dir, batch_size=FLAGS.batch_size, # for ExamplesPerSecondHook tensors_to_log={"cross_entropy": "cross_entropy"} ) for cycle_index in range(FLAGS.train_epochs): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, FLAGS.train_epochs)) train_input_fn = producer.make_input_fn(is_training=True) estimator.train(input_fn=train_input_fn, hooks=train_hooks, steps=num_train_steps) tf.logging.info("Beginning evaluation.") eval_input_fn = producer.make_input_fn(is_training=False) eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps) tf.logging.info("Evaluation complete.") hr = float(eval_results[rconst.HR_KEY]) ndcg = float(eval_results[rconst.NDCG_KEY]) loss = float(eval_results["loss"]) tf.logging.info( "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format( cycle_index + 1, hr, ndcg, loss)) producer.stop_loop() producer.join()
def resnet_main(flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj) }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=flags_obj.epochs_between_evals, num_gpus=flags_core.get_num_gpus(flags_obj)) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated 
with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) tf.logging.info( "Converting eval_results values from numpy types to native Python types " "(needed for json.dump).") for key, value in eval_results.items(): eval_results[key] = value.item() benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) savedmodel_raw_path = classifier.export_savedmodel( flags_obj.export_dir, input_receiver_fn) savedmodel_raw_path = savedmodel_raw_path.decode() # convert the returned byte string to a normal string if (flags_obj.benchmark_logger_type == "BenchmarkFileLogger" and hasattr(flags_obj, "benchmark_log_dir")): log_dir = flags_obj.benchmark_log_dir benchmark_log_path = os.path.join( log_dir, logger.BENCHMARK_RUN_LOG_FILE_NAME) # Determine how many GPUs were used (or whether the run was CPU-only) with open(benchmark_log_path, "r") as json_file: log = json.load(json_file) num_gpus = int(log["machine_config"]["gpu_info"]["count"]) gpu_info = "" if num_gpus == 0: gpu_info = "_cpu" elif num_gpus > 0: gpu_info = "_" + str(num_gpus) + "gpu" # Append "_cpu" or "_<N>gpu" to the just-saved SavedModel directory name savedmodel_path = savedmodel_raw_path + gpu_info shutil.move(savedmodel_raw_path, savedmodel_path) # Move benchmark_run.log and metric.log into the SavedModel directory shutil.move(benchmark_log_path, savedmodel_path) shutil.move(os.path.join(log_dir, logger.METRIC_LOG_FILE_NAME), savedmodel_path) tf.logging.info("SavedModel path: %s", savedmodel_path) # Zip the trained graph (the SavedModel directory); # adapted from https://stackoverflow.com/questions/1855095/how-to-create-a-zip-archive-of-a-directory-in-python # full path to the zip file graph_zip_path = savedmodel_path + '.zip' # cd to the directory that contains the trained graph os.chdir(os.path.dirname(savedmodel_path.rstrip('/'))) dir_to_zip = savedmodel_path.rstrip('/').split('/')[-1] graph_zip = zipfile.ZipFile(graph_zip_path, 'w', zipfile.ZIP_DEFLATED) for root, dirs, files in os.walk(dir_to_zip): for file in files: graph_zip.write(os.path.join(root, file)) graph_zip.close() return graph_zip_path
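# A self-contained sketch of the archiving step above: walk a SavedModel directory and write
# its files into a .zip. Unlike the loop above it uses relative arcnames instead of os.chdir;
# the example path is hypothetical.
import os
import zipfile

def zip_directory(dir_path):
    """Create <dir_path>.zip containing every file under dir_path."""
    dir_path = dir_path.rstrip('/')
    zip_path = dir_path + '.zip'
    parent = os.path.dirname(dir_path) or '.'
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(dir_path):
            for fname in files:
                full = os.path.join(root, fname)
                # Store entries relative to the parent so the archive unpacks into a
                # single top-level directory, as the code above intends.
                zf.write(full, os.path.relpath(full, parent))
    return zip_path

# Usage (hypothetical path): zip_directory('/tmp/my_savedmodel_1gpu')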
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'data_format': data_format, }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True)
def resnet_main(seed, flags, model_function, input_function, shape=None): """Shared main loop for ResNet Models. Args: flags: FLAGS object that contains the params for running. See ResnetArgParser for created flags. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. shape: list of ints representing the shape of the images used for training. This is only used if flags.export_dir is passed. """ # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags.inter_op_parallelism_threads, intra_op_parallelism_threads=flags.intra_op_parallelism_threads, allow_soft_placement=True) if flags.num_gpus == 0: distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0') elif flags.num_gpus == 1: distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0') else: distribution = tf.contrib.distribute.MirroredStrategy( num_gpus=flags.num_gpus ) run_config = tf.estimator.RunConfig(train_distribute=distribution, session_config=session_config, tf_random_seed=seed) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags.model_dir, config=run_config, params={ 'resnet_size': flags.resnet_size, 'data_format': flags.data_format, 'batch_size': flags.batch_size, 'version': flags.version, 'loss_scale': flags.loss_scale, 'dtype': flags.dtype }) if flags.benchmark_log_dir is not None: benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir) benchmark_logger.log_run_info('resnet') else: benchmark_logger = None for _ in range(flags.train_epochs // flags.epochs_between_evals): train_hooks = hooks_helper.get_train_hooks( flags.hooks, batch_size=flags.batch_size, benchmark_log_dir=flags.benchmark_log_dir) print('Starting a training cycle.') def input_fn_train(): return input_function( is_training=True, data_dir=flags.data_dir, batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus), num_epochs=flags.epochs_between_evals, ) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags.max_train_steps) print('Starting to evaluate.') # Evaluate the model and print results def input_fn_eval(): return input_function( is_training=False, data_dir=flags.data_dir, batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus), num_epochs=1, ) # flags.max_train_steps is generally associated with testing and profiling. # As a result it is frequently called with synthetic data, which will # iterate forever. Passing steps=flags.max_train_steps allows the eval # (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. 
eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags.max_train_steps) print(eval_results) if benchmark_logger: benchmark_logger.log_estimator_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags.stop_threshold, eval_results['accuracy']): break if flags.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags.batch_size) classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
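# Several of the loops above call model_helpers.past_stop_threshold to decide whether to break
# out early. This is a minimal sketch of the semantics implied by those call sites (stop once
# the evaluation metric reaches the threshold; a None threshold disables early stopping), not
# the repo's actual helper.
def past_stop_threshold(stop_threshold, eval_metric):
    """Return True if eval_metric has reached stop_threshold; None disables the check."""
    if stop_threshold is None:
        return False
    return eval_metric >= stop_threshold

assert past_stop_threshold(None, 0.99) is False   # no threshold -> keep training
assert past_stop_threshold(0.90, 0.92) is True    # accuracy passed threshold -> stop
assert past_stop_threshold(0.90, 0.85) is False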
def test_raise_in_invalid_names(self): invalid_names = ['StepCounterHook', 'StopAtStepHook'] with self.assertRaises(ValueError): hooks_helper.get_train_hooks(invalid_names, model_dir="", batch_size=256)
def run_ncf(_): """Run NCF training and eval loop.""" if FLAGS.download_if_missing: movielens.download(FLAGS.dataset, FLAGS.data_dir) movielens_dataset.construct_train_eval_csv(data_dir=FLAGS.data_dir, dataset=FLAGS.dataset) tf.logging.info("Data preprocessing...") ncf_dataset = movielens_dataset.data_preprocessing(FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg) model_helpers.apply_clean(flags.FLAGS) # Create NeuMF model and convert it to Estimator tf.logging.info("Creating Estimator from Keras model...") layers = [int(layer) for layer in FLAGS.layers] mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization] keras_model = neumf_model.NeuMF(ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors, layers, FLAGS.batch_size, FLAGS.mf_regularization, mlp_regularization) num_gpus = flags_core.get_num_gpus(FLAGS) estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, model_dir=FLAGS.model_dir, batch_size=FLAGS.batch_size # for ExamplesPerSecondHook ) run_params = { "batch_size": FLAGS.batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info(model_name="recommendation", dataset_name=FLAGS.dataset, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Training and evaluation cycle def get_train_input_fn(): return movielens_dataset.get_input_fn( True, distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.data_dir, FLAGS.dataset, FLAGS.epochs_between_evals) def get_pred_input_fn(): return movielens_dataset.get_input_fn( False, distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) # Train the model estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks) # Evaluate the model eval_results = evaluate_model(estimator, FLAGS.batch_size, num_gpus, ncf_dataset, get_pred_input_fn()) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. hr = eval_results[_HR_KEY] ndcg = eval_results[_NDCG_KEY] tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): break # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
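# run_ncf logs HR and NDCG after each cycle. The following is a generic illustration of those
# two ranking metrics for a single user using their standard definitions; it is not the
# evaluate_model implementation used above, and the example item ids are made up.
import math

def hit_rate_and_ndcg(ranked_items, true_item, k=10):
    """Return (HR@k, NDCG@k) for one user given a ranked item list."""
    top_k = ranked_items[:k]
    if true_item not in top_k:
        return 0.0, 0.0
    rank = top_k.index(true_item)             # 0-based position of the held-out item
    return 1.0, 1.0 / math.log2(rank + 2)     # DCG of a single relevant item

# Example: the held-out item appears at position 2 of the top-10 list.
hr, ndcg = hit_rate_and_ndcg(ranked_items=[5, 9, 42, 7], true_item=42, k=10)
assert hr == 1.0 and abs(ndcg - 0.5) < 1e-9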
def run_transformer(flags_obj): """Create tf.Estimator to train and evaluate transformer model. Args: flags_obj: Object containing parsed flag values. """ num_gpus = flags_core.get_num_gpus(flags_obj) # Add flag-defined parameters to params object params = PARAMS_MAP[flags_obj.param_set] if num_gpus > 1: if flags_obj.param_set == "big": params = model_params.BIG_MULTI_GPU_PARAMS elif flags_obj.param_set == "base": params = model_params.BASE_MULTI_GPU_PARAMS params["data_dir"] = flags_obj.data_dir params["model_dir"] = flags_obj.model_dir params["num_parallel_calls"] = flags_obj.num_parallel_calls params["tpu"] = flags_obj.tpu params["use_tpu"] = bool(flags_obj.tpu) # was a tpu specified. params["static_batch"] = flags_obj.static_batch or params["use_tpu"] params["allow_ffn_pad"] = not params["use_tpu"] params["use_synthetic_data"] = flags_obj.use_synthetic_data # Set batch size parameter, which depends on the availability of # TPU and GPU, and distribution settings. params["batch_size"] = ( flags_obj.batch_size or (params["default_batch_size_tpu"] if params["use_tpu"] else params["default_batch_size"])) if not params["use_tpu"]: params["batch_size"] = distribution_utils.per_replica_batch_size( params["batch_size"], num_gpus) schedule_manager = schedule.Manager( train_steps=flags_obj.train_steps, steps_between_evals=flags_obj.steps_between_evals, train_epochs=flags_obj.train_epochs, epochs_between_evals=flags_obj.epochs_between_evals, default_train_epochs=DEFAULT_TRAIN_EPOCHS, batch_size=params["batch_size"], max_length=params["max_length"], use_tpu=params["use_tpu"], num_tpu_shards=flags_obj.num_tpu_shards) params["repeat_dataset"] = schedule_manager.repeat_dataset model_helpers.apply_clean(flags.FLAGS) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, tensors_to_log=TENSORS_TO_LOG, # used for logging hooks batch_size=schedule_manager.batch_size, # for ExamplesPerSecondHook use_tpu=params["use_tpu"] # Not all hooks can run with TPUs ) benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info(model_name="transformer", dataset_name="wmt_translate_ende", run_params=params, test_id=flags_obj.benchmark_test_id) # Train and evaluate transformer model estimator = construct_estimator(flags_obj, params, schedule_manager) run_loop( estimator=estimator, # Training arguments schedule_manager=schedule_manager, train_hooks=train_hooks, benchmark_logger=benchmark_logger, # BLEU calculation arguments bleu_source=flags_obj.bleu_source, bleu_ref=flags_obj.bleu_ref, bleu_threshold=flags_obj.stop_threshold, vocab_file=flags_obj.vocab_file) if flags_obj.export_dir and not params["use_tpu"]: serving_input_fn = export.build_tensor_serving_input_receiver_fn( shape=[None], dtype=tf.int64, batch_size=None) # Export saved model, and save the vocab file as an extra asset. The vocab # file is saved to allow consistent input encoding and output decoding. # (See the "Export trained model" section in the README for an example of # how to use the vocab file.) # Since the model itself does not use the vocab file, this file is saved as # an extra asset rather than a core asset. estimator.export_savedmodel( flags_obj.export_dir, serving_input_fn, assets_extra={"vocab.txt": flags_obj.vocab_file}, strip_default_attrs=True)
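# run_transformer picks a batch size from the flag or a parameter default, then divides it
# across replicas when not running on TPU. The same decision in isolation; parameter names
# mirror the params dict above, the example values are made up, and even divisibility across
# GPUs is assumed.
def resolve_batch_size(flag_batch_size, params, num_gpus):
    """Return the per-replica batch size following the logic in run_transformer."""
    batch_size = flag_batch_size or (
        params["default_batch_size_tpu"] if params["use_tpu"]
        else params["default_batch_size"])
    if not params["use_tpu"]:
        # Split the global batch across GPU replicas (assumes even divisibility).
        batch_size //= max(num_gpus, 1)
    return batch_size

example_params = {"use_tpu": False,
                  "default_batch_size": 4096,
                  "default_batch_size_tpu": 32768}
assert resolve_batch_size(None, example_params, num_gpus=2) == 2048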
def test_raise_in_non_list_names(self): with self.assertRaises(ValueError): hooks_helper.get_train_hooks( 'LoggingTensorHook, ProfilerHook', batch_size=256)
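# The two tests above expect get_train_hooks to reject a comma-separated string and unknown
# hook names with ValueError. One plausible shape for that validation, shown only as a sketch;
# the hook-name set is illustrative and this is not the hooks_helper implementation.
KNOWN_HOOKS = {"loggingtensorhook", "profilerhook",
               "examplespersecondhook", "loggingmetrichook"}  # illustrative set

def validate_hook_names(name_list):
    """Raise ValueError for non-list input or unrecognized hook names."""
    if not isinstance(name_list, (list, tuple)):
        raise ValueError("Hook names must be given as a list, got: %r" % (name_list,))
    for name in name_list:
        if name.lower() not in KNOWN_HOOKS:
            raise ValueError("Unrecognized training hook requested: %s" % name)

# validate_hook_names('LoggingTensorHook, ProfilerHook')  -> ValueError (string, not list)
# validate_hook_names(['StepCounterHook'])                -> ValueError (unknown name)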
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. Returns: Dict of results of the run. Contains the keys `eval_results` and `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5. `train_hooks` is a list of the instances of hooks used during training. """ model_helpers.apply_clean(flags.FLAGS) # Ensures flag override logic is only executed if explicitly triggered. if flags_obj.tf_gpu_thread_mode: override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) # Configures cluster spec for distribution strategy. num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) # Creates session config. allow_soft_placement = True is required for # multi-GPU and is not harmful for other modes. session_config = tf.compat.v1.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_core.get_num_gpus(flags_obj), num_workers=num_workers, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) # Creates a `RunConfig` that checkpoints every 24 hours which essentially # results in checkpoints determined only by `epochs_between_evals`. run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config, save_checkpoints_secs=60*60*24, save_checkpoints_steps=None) # Initializes model with all but the dense layer from pretrained ResNet. 
if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune, 'num_workers': num_workers, }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, 'num_workers': num_workers, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs, input_context=None): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_replica_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, dtype=flags_core.get_tf_dtype(flags_obj), datasets_num_private_threads=flags_obj.datasets_num_private_threads, num_parallel_batches=flags_obj.datasets_num_parallel_batches, input_context=input_context) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_replica_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else flags_obj.train_epochs) use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1 if use_train_and_evaluate: train_spec = tf.estimator.TrainSpec( input_fn=lambda input_context=None: input_fn_train( train_epochs, input_context=input_context), hooks=train_hooks, max_steps=flags_obj.max_train_steps) eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval) tf.compat.v1.logging.info('Starting to train and evaluate.') tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) # tf.estimator.train_and_evalute doesn't return anything in multi-worker # case. eval_results = {} else: if train_epochs == 0: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. 
n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals) schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))] schedule[-1] = train_epochs - sum(schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: # Since we are calling classifier.train immediately in each loop, the # value of num_train_epochs in the lambda function will not be changed # before it is used. So it is safe to ignore the pylint error here # pylint: disable=cell-var-from-loop classifier.train( input_fn=lambda input_context=None: input_fn_train( num_train_epochs, input_context=input_context), hooks=train_hooks, max_steps=flags_obj.max_train_steps) # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, # which will iterate forever. Passing steps=flags_obj.max_train_steps # allows the eval (which is generally unimportant in those circumstances) # to terminate. Note that eval will run for max_train_steps each loop, # regardless of the global_step count. tf.compat.v1.logging.info('Starting to evaluate.') eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. export_dtype = flags_core.get_tf_dtype(flags_obj) if flags_obj.image_bytes_as_serving_input: input_receiver_fn = functools.partial( image_bytes_serving_input_fn, shape, dtype=export_dtype) else: input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size, dtype=export_dtype) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True) stats = {} stats['eval_results'] = eval_results stats['train_hooks'] = train_hooks return stats
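# The comment above describes how train_epochs is split into evaluation cycles, e.g. 25 epochs
# with 10 epochs between evals gives [10, 10, 5]. The same computation on its own:
import math

def epoch_schedule(train_epochs, epochs_between_evals):
    """Return the list of epoch counts to train for before each evaluation."""
    if train_epochs == 0:
        return [0]                                      # eval-only: a single zero-epoch loop
    n_loops = math.ceil(train_epochs / epochs_between_evals)
    schedule = [epochs_between_evals] * int(n_loops)
    schedule[-1] = train_epochs - sum(schedule[:-1])    # avoid over counting
    return schedule

assert epoch_schedule(25, 10) == [10, 10, 5]
assert epoch_schedule(0, 10) == [0]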
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn # Get number of GPUs as defined by the --num_gpus flags and the number of # GPUs available on the machine. num_gpus = flags_core.get_num_gpus(flags_obj) multi_gpu = num_gpus > 1 if multi_gpu: # Validate that the batch size can be split into devices. distribution_utils.per_device_batch_size(flags_obj.batch_size, num_gpus) # There are two steps required if using multi-GPU: (1) wrap the model_fn, # and (2) wrap the optimizer. The first happens here, and (2) happens # in the model_fn itself when the optimizer is defined. model_function = tf.contrib.estimator.replicate_model_fn( model_fn, loss_reduction=tf.losses.Reduction.MEAN, devices=["/device:GPU:%d" % d for d in range(num_gpus)]) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator(model_fn=model_function, params={ 'data_format': data_format, 'multi_gpu': multi_gpu }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) def invert(image, label): return (image * -1.0) + 1.0, label def brightness(image, label): return tf.image.random_brightness(image, max_delta=0.2), label if INVERT: inverted = ds.map(invert) ds = ds.concatenate(inverted) if BRIGHTNESS: ds = ds.concatenate(ds.map(brightness)) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn) def our_test_fn(): images = [] for i in list(range(1, 10)) + ['dog']: images.append( np.array(imageio.imread('{}.png'.format(i)).ravel() / 255.0, dtype='float32')) images = np.array(images) return tf.convert_to_tensor(images) # Check our own examples predictions = mnist_classifier.predict(input_fn=our_test_fn) table = [] for i in list(range(1, 10)) + ['dog']: prediction = next(predictions) if i == 'dog': print("{}. CNN thinks it's a {} ({:.1f}%)".format( i, prediction['classes'], prediction['probabilities'][prediction['classes']] * 100)) else: print("{} at {:.1f}. 
CNN thinks it's a {} ({:.1f}%)".format( i, prediction['probabilities'][i] * 100, prediction['classes'], prediction['probabilities'][prediction['classes']] * 100)) table.append((i, prediction['probabilities']))
def run_loop(name, train_input_fn, eval_input_fn, model_column_fn, build_estimator_fn, flags_obj, tensors_to_log, early_stop=False): """Define training loop.""" model_helpers.apply_clean(flags.FLAGS) model = build_estimator_fn(model_dir=flags_obj.model_dir, model_type=flags_obj.model_type, model_column_fn=model_column_fn, inter_op=flags_obj.inter_op_parallelism_threads, intra_op=flags_obj.intra_op_parallelism_threads) run_params = { 'batch_size': flags_obj.batch_size, 'train_epochs': flags_obj.train_epochs, 'model_type': flags_obj.model_type, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('wide_deep', name, run_params, test_id=flags_obj.benchmark_test_id) loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') tensors_to_log = { k: v.format(loss_prefix=loss_prefix) for k, v in tensors_to_log.items() } train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log) # Train and evaluate the model every `flags.epochs_between_evals` epochs. for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): model.train(input_fn=train_input_fn, hooks=train_hooks) results = model.evaluate(input_fn=eval_input_fn) # Display evaluation metrics tf.logging.info('Results at epoch %d / %d', (n + 1) * flags_obj.epochs_between_evals, flags_obj.train_epochs) tf.logging.info('-' * 60) for key in sorted(results): tf.logging.info('%s: %s' % (key, results[key])) benchmark_logger.log_evaluation_result(results) if early_stop and model_helpers.past_stop_threshold( flags_obj.stop_threshold, results['accuracy']): break # Export the model if flags_obj.export_dir is not None: export_model(model, flags_obj.model_type, flags_obj.export_dir, model_column_fn)
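# run_loop fills a {loss_prefix} placeholder in each tensor-name template before handing them
# to the logging hooks. A small illustration; the prefix table here is illustrative only, and
# the templates match the names used in run_wide_deep below.
LOSS_PREFIX_EXAMPLE = {'wide': 'linear/linear_model/', 'deep': 'dnn/'}  # illustrative values

def format_tensors_to_log(tensors_to_log, model_type):
    """Substitute the model-specific loss prefix into each tensor-name template."""
    loss_prefix = LOSS_PREFIX_EXAMPLE.get(model_type, '')
    return {k: v.format(loss_prefix=loss_prefix)
            for k, v in tensors_to_log.items()}

templates = {'average_loss': '{loss_prefix}head/truediv',
             'loss': '{loss_prefix}head/weighted_loss/Sum'}
assert format_tensors_to_log(templates, 'deep')['loss'] == 'dnn/head/weighted_loss/Sum'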
def resnet_main(flags, model_function, input_function, shape=None): """Shared main loop for ResNet Models. Args: flags: FLAGS object that contains the params for running. See ResnetArgParser for created flags. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. shape: list of ints representing the shape of the images used for training. This is only used if flags.export_dir is passed. """ # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' if flags.multi_gpu: validate_batch_size_for_multi_gpu(flags.batch_size) # There are two steps required if using multi-GPU: (1) wrap the model_fn, # and (2) wrap the optimizer. The first happens here, and (2) happens # in the model_fn itself when the optimizer is defined. model_function = tf.contrib.estimator.replicate_model_fn( model_function, loss_reduction=tf.losses.Reduction.MEAN) # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags.inter_op_parallelism_threads, intra_op_parallelism_threads=flags.intra_op_parallelism_threads, allow_soft_placement=True) # Set up a RunConfig to save checkpoint and set session config. run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags.model_dir, config=run_config, params={ 'resnet_size': flags.resnet_size, 'data_format': flags.data_format, 'batch_size': flags.batch_size, 'multi_gpu': flags.multi_gpu, 'version': flags.version, }) if flags.benchmark_log_dir is not None: benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir) benchmark_logger.log_run_info("resnet") else: benchmark_logger = None for _ in range(flags.train_epochs // flags.epochs_between_evals): train_hooks = hooks_helper.get_train_hooks( flags.hooks, batch_size=flags.batch_size, benchmark_log_dir=flags.benchmark_log_dir) print('Starting a training cycle.') def input_fn_train(): return input_function(True, flags.data_dir, flags.batch_size, flags.epochs_between_evals, flags.num_parallel_calls, flags.multi_gpu) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags.max_train_steps) print('Starting to evaluate.') # Evaluate the model and print results def input_fn_eval(): return input_function(False, flags.data_dir, flags.batch_size, 1, flags.num_parallel_calls, flags.multi_gpu) # flags.max_train_steps is generally associated with testing and profiling. # As a result it is frequently called with synthetic data, which will # iterate forever. Passing steps=flags.max_train_steps allows the eval # (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. 
eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags.max_train_steps) print(eval_results) if benchmark_logger: benchmark_logger.log_estimator_evaluation_result(eval_results) if flags.export_dir is not None: warn_on_multi_gpu_export(flags.multi_gpu) # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags.batch_size) classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
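# The MNIST and ResNet variants above validate that the global batch size splits evenly across
# GPUs before training. A sketch of that check and the resulting per-device size; the real
# helpers (per_device_batch_size, validate_batch_size_for_multi_gpu) live in the repo's utils.
def per_device_batch_size(batch_size, num_gpus):
    """Return the per-GPU batch size, requiring even divisibility across devices."""
    if num_gpus <= 1:
        return batch_size
    if batch_size % num_gpus:
        raise ValueError(
            "Batch size %d cannot be split evenly across %d GPUs." %
            (batch_size, num_gpus))
    return batch_size // num_gpus

assert per_device_batch_size(256, 0) == 256
assert per_device_batch_size(256, 4) == 64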
def run_deep_speech(_): """Run deep speech training and eval loop.""" tf.set_random_seed(flags_obj.seed) # Data preprocessing tf.logging.info("Data preprocessing...") train_speech_dataset = generate_dataset(flags_obj.train_data_dir) eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir) # Number of label classes. Label string is "[a-z]' -" num_classes = len(train_speech_dataset.speech_labels) # Use distribution strategy for multi-gpu training num_gpus = flags_core.get_num_gpus(flags_obj) distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy) estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=flags_obj.model_dir, config=run_config, params={ "num_classes": num_classes, } ) # Benchmark logging run_params = { "batch_size": flags_obj.batch_size, "train_epochs": flags_obj.train_epochs, "rnn_hidden_size": flags_obj.rnn_hidden_size, "rnn_hidden_layers": flags_obj.rnn_hidden_layers, "rnn_type": flags_obj.rnn_type, "is_bidirectional": flags_obj.is_bidirectional, "use_bias": flags_obj.use_bias } dataset_name = "LibriSpeech" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info("deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) per_device_batch_size = distribution_utils.per_device_batch_size( flags_obj.batch_size, num_gpus) def input_fn_train(): return dataset.input_fn( per_device_batch_size, train_speech_dataset) def input_fn_eval(): return dataset.input_fn( per_device_batch_size, eval_speech_dataset) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle) # Perform batch_wise dataset shuffling train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle( train_speech_dataset.entries, cycle_index, flags_obj.sortagrad, flags_obj.batch_size) estimator.train(input_fn=input_fn_train, hooks=train_hooks) # Evaluation tf.logging.info("Starting to evaluate...") eval_results = evaluate_model( estimator, eval_speech_dataset.speech_labels, eval_speech_dataset.entries, input_fn_eval) # Log the WER and CER results. benchmark_logger.log_evaluation_result(eval_results) tf.logging.info( "Iteration {}: WER = {:.2f}, CER = {:.2f}".format( cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY])) # If some evaluation threshold is met if model_helpers.past_stop_threshold( flags_obj.wer_threshold, eval_results[_WER_KEY]): break
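# run_deep_speech reports WER and CER each cycle. Below is a generic word-error-rate
# computation via edit distance (the standard definition), included only as an illustration;
# it is not the repo's evaluate_model.
def edit_distance(ref, hyp):
    """Levenshtein distance between two token sequences."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1,        # deletion
                                     dp[j - 1] + 1,    # insertion
                                     prev + (r != h))  # substitution
    return dp[-1]

def word_error_rate(reference, hypothesis):
    """WER = word-level edit distance divided by the reference length."""
    ref_words = reference.split()
    return edit_distance(ref_words, hypothesis.split()) / float(len(ref_words))

assert abs(word_error_rate("the cat sat", "the cat sit") - 1.0 / 3) < 1e-9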
def run_Simple_model(flags_obj): """Run Simple_model training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config) data_format = 'channels_first' classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'data_format': data_format, }) # Use `tf.parse_single_example()` to extract data from a `tf.Example` # protocol buffer, and perform any additional per-record preprocessing. def parser_train(record): keys_to_features = { "image_raw": tf.FixedLenFeature((), tf.string, default_value=""), "label": tf.FixedLenFeature((), tf.int64, default_value=tf.zeros([], dtype=tf.int64)), } parsed = tf.parse_single_example(record, keys_to_features) # Perform additional preprocessing on the parsed data. image = tf.image.decode_jpeg(parsed["image_raw"]) image = tf.reshape(image, [64, 64, 3]) # subtract mean mean = tf.convert_to_tensor(mean_value, dtype=tf.uint8) image = image - mean image = tf.image.random_crop(image, [48, 48, 3]) image = tf.image.random_flip_left_right(image) # flip image randomly with a 50% chance image = tf.image.random_brightness(image, max_delta=0.3) # random brightness l = [1, 2, 3] deg = random.choice(l) image = tf.image.rot90(image, k=deg) image = tf.transpose(image, perm=[2, 0, 1]) # channels first image = tf.cast(image, tf.int32) image = tf.truediv(image, 255) image = tf.cast(image, tf.float32) label = tf.cast(parsed["label"], tf.int32) return {"image": image}, label # Use `tf.parse_single_example()` to extract data from a `tf.Example` # protocol buffer, and perform any additional per-record preprocessing. def parser_eval(record): keys_to_features = { "image_raw": tf.FixedLenFeature((), tf.string, default_value=""), "label": tf.FixedLenFeature((), tf.int64, default_value=tf.zeros([], dtype=tf.int64)), } parsed = tf.parse_single_example(record, keys_to_features) # Perform additional preprocessing on the parsed data. image = tf.image.decode_jpeg(parsed["image_raw"]) image = tf.reshape(image, [64, 64, 3]) # subtract mean mean = tf.convert_to_tensor(mean_value, dtype=tf.uint8) image = image - mean image = tf.image.random_crop(image, [48, 48, 3]) image = tf.transpose(image, perm=[2, 0, 1]) # channels first image = tf.cast(image, tf.int32) image = tf.truediv(image, 255) image = tf.cast(image, tf.float32) label = tf.cast(parsed["label"], tf.int32) return {"image": image}, label # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" train_tfrecord = 'dataset/train_images.tfrecords' dataset = tf.data.TFRecordDataset(train_tfrecord) # Use `Dataset.map()` to build a pair of a feature dictionary and a label # tensor for each example. dataset = dataset.map(parser_train) dataset = dataset.shuffle(buffer_size=5000) dataset = dataset.batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. 
dataset = dataset.repeat(flags_obj.epochs_between_evals) return dataset def eval_input_fn(): filenames = ["dataset/val_images.tfrecords"] dataset = tf.data.TFRecordDataset(filenames) # Use `Dataset.map()` to build a pair of a feature dictionary and a label # tensor for each example. dataset = dataset.map(parser_eval) dataset = dataset.batch(flags_obj.batch_size) # `repeat()` restarts the dataset once it reaches the end, so it is not needed here: # dataset = dataset.repeat() return dataset # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 3, 48, 48]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True)
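# parser_train and parser_eval above expect tf.Example records with an `image_raw` bytes
# feature (JPEG-encoded 64x64 RGB image) and an int64 `label`. A hedged sketch of how such a
# TFRecord file could be written with the TF 1.x API; the output path and image file are
# hypothetical.
import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def write_examples(output_path, examples):
    """Write (jpeg_bytes, label) pairs as serialized tf.Example records."""
    with tf.python_io.TFRecordWriter(output_path) as writer:
        for jpeg_bytes, label in examples:
            example = tf.train.Example(features=tf.train.Features(feature={
                "image_raw": _bytes_feature(jpeg_bytes),
                "label": _int64_feature(label),
            }))
            writer.write(example.SerializeToString())

# Usage sketch (hypothetical data):
# with open('img_0.jpg', 'rb') as f:
#     write_examples('dataset/train_images.tfrecords', [(f.read(), 3)])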
def run_wide_deep(flags_obj): """Run Wide-Deep training and eval loop. Args: flags_obj: An object containing parsed flag values. """ # Clean up the model directory if present shutil.rmtree(flags_obj.model_dir, ignore_errors=True) model = build_estimator(flags_obj.model_dir, flags_obj.model_type) train_file = os.path.join(flags_obj.data_dir, 'adult.data') test_file = os.path.join(flags_obj.data_dir, 'adult.test') # Train and evaluate the model every `flags.epochs_between_evals` epochs. def train_input_fn(): return input_fn(train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size) def eval_input_fn(): return input_fn(test_file, 1, False, flags_obj.batch_size) run_params = { 'batch_size': flags_obj.batch_size, 'train_epochs': flags_obj.train_epochs, 'model_type': flags_obj.model_type, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('wide_deep', 'Census Income', run_params, test_id=flags_obj.benchmark_test_id) loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, batch_size=flags_obj.batch_size, tensors_to_log={ 'average_loss': loss_prefix + 'head/truediv', 'loss': loss_prefix + 'head/weighted_loss/Sum' }) # Train and evaluate the model every `flags.epochs_between_evals` epochs. for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): model.train(input_fn=train_input_fn, hooks=train_hooks) results = model.evaluate(input_fn=eval_input_fn) # Display evaluation metrics tf.logging.info('Results at epoch %d / %d', (n + 1) * flags_obj.epochs_between_evals, flags_obj.train_epochs) tf.logging.info('-' * 60) for key in sorted(results): tf.logging.info('%s: %s' % (key, results[key])) benchmark_logger.log_evaluation_result(results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, results['accuracy']): break # Export the model if flags_obj.export_dir is not None: export_model(model, flags_obj.model_type, flags_obj.export_dir)
def test_raise_in_invalid_names(self): invalid_names = ['StepCounterHook', 'StopAtStepHook'] with self.assertRaises(ValueError): hooks_helper.get_train_hooks(invalid_names, batch_size=256)
def net_main(flags_obj, model_function, input_function, net_data_configs, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) is_metriclog = True if is_metriclog: metric_logfn = os.path.join(flags_obj.model_dir, 'log_metric.txt') metric_logf = open(metric_logfn, 'a') from tensorflow.contrib.memory_stats.ops import gen_memory_stats_ops max_memory_usage = gen_memory_stats_ops.max_bytes_in_use() # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config) # initialize our model with all but the dense layer from pretrained resnet if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'loss_scale': flags_core.get_loss_scale(flags_obj), 'weight_decay': flags_obj.weight_decay, 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune, 'examples_per_epoch': flags_obj.examples_per_epoch, 'net_data_configs': net_data_configs }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } dataset_name = net_data_configs['dataset_name'] if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('meshnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, 
num_gpus=flags_core.get_num_gpus(flags_obj), examples_per_epoch=flags_obj.examples_per_epoch, sg_settings=net_data_configs['sg_settings']) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, sg_settings=net_data_configs['sg_settings']) if flags_obj.eval_only or flags_obj.pred_ply or not flags_obj.train_epochs: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals) schedule = [ flags_obj.epochs_between_evals for _ in range(int(n_loops)) ] schedule[-1] = flags_obj.train_epochs - sum( schedule[:-1]) # over counting. classifier.train(input_fn=lambda: input_fn_train(1), hooks=train_hooks, max_steps=10) with tf.Session() as sess: max_memory_usage_v = sess.run(max_memory_usage) tf.logging.info('\n\nmemory usage: %0.3f G\n\n' % (max_memory_usage_v * 1.0 / 1e9)) best_acc, best_acc_checkpoint = load_saved_best(flags_obj.model_dir) for cycle_index, num_train_epochs in enumerate(schedule): tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) t0 = time.time() train_t = 0 if num_train_epochs: classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), hooks=train_hooks, max_steps=flags_obj.max_train_steps) train_t = (time.time() - t0) / num_train_epochs tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. 
only_train = False and (not flags_obj.eval_only) and ( not flags_obj.pred_ply) if not only_train: t0 = time.time() eval_results = classifier.evaluate( input_fn=input_fn_eval, steps=flags_obj.max_train_steps, ) #checkpoint_path=best_acc_checkpoint) eval_t = time.time() - t0 if flags_obj.pred_ply: pred_generator = classifier.predict(input_fn=input_fn_eval) num_classes = net_data_configs['dset_metas'].num_classes gen_pred_ply(eval_results, pred_generator, flags_obj.model_dir, num_classes) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break cur_is_best = '' if num_train_epochs and eval_results['accuracy'] > best_acc: best_acc = eval_results['accuracy'] save_cur_model_as_best_acc(flags_obj.model_dir, best_acc) cur_is_best = 'best' global_step = cur_global_step(flags_obj.model_dir) epoch = int(global_step / flags_obj.examples_per_epoch * flags_obj.num_gpus) ious_str = get_ious_str(eval_results['cm'], net_data_configs['dset_metas'], eval_results['mean_iou']) metric_logf.write( '\n{} train t:{:.1f} eval t:{:.1f} \teval acc:{:.3f} \tmean_iou:{:.3f} {} {}\n' .format(epoch, train_t, eval_t, eval_results['accuracy'], eval_results['mean_iou'], cur_is_best, ious_str)) metric_logf.flush() if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def run_ncf(_): """Run NCF training and eval loop.""" # Data preprocessing # The file name of training and test dataset train_fname = os.path.join( FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME) test_fname = os.path.join( FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME) neg_fname = os.path.join( FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME) assert os.path.exists(train_fname), ( "Run data_download.py first to download and extract {} dataset".format( FLAGS.dataset)) tf.logging.info("Data preprocessing...") ncf_dataset = dataset.data_preprocessing( train_fname, test_fname, neg_fname, FLAGS.num_neg) # Create NeuMF model and convert it to Estimator tf.logging.info("Creating Estimator from Keras model...") layers = [int(layer) for layer in FLAGS.layers] mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization] keras_model = neumf_model.NeuMF( ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors, layers, FLAGS.batch_size, FLAGS.mf_regularization, mlp_regularization) num_gpus = flags_core.get_num_gpus(FLAGS) estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, batch_size=FLAGS.batch_size # for ExamplesPerSecondHook ) run_params = { "batch_size": FLAGS.batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info( model_name="recommendation", dataset_name=FLAGS.dataset, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Training and evaluation cycle def train_input_fn(): return dataset.input_fn( True, distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.epochs_between_evals) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) # Train the model estimator.train(input_fn=train_input_fn, hooks=train_hooks) # Evaluate the model eval_results = evaluate_model( estimator, FLAGS.batch_size, num_gpus, ncf_dataset) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. hr = eval_results[_HR_KEY] ndcg = eval_results[_NDCG_KEY] tf.logging.info( "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): break # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
def resnet_main(flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) if flags_core.get_num_gpus(flags_obj) == 0: distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0') elif flags_core.get_num_gpus(flags_obj) == 1: distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0') else: distribution = tf.contrib.distribute.MirroredStrategy( num_gpus=flags_core.get_num_gpus(flags_obj)) run_config = tf.estimator.RunConfig(train_distribute=distribution, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj) }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + "-synthetic" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, batch_size=flags_obj.batch_size) def input_fn_train(): return input_function(is_training=True, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=flags_obj.epochs_between_evals) def input_fn_eval(): return input_function(is_training=False, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # 
flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) if flags_core.get_num_gpus(flags_obj) == 0: distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0') elif flags_core.get_num_gpus(flags_obj) == 1: distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0') else: distribution = tf.contrib.distribute.MirroredStrategy( num_gpus=flags_core.get_num_gpus(flags_obj) ) run_config = tf.estimator.RunConfig(train_distribute=distribution, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj) }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } benchmark_logger = logger.config_benchmark_logger(flags_obj) benchmark_logger.log_run_info('resnet', dataset_name, run_params) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, batch_size=flags_obj.batch_size) def input_fn_train(): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=flags_obj.epochs_between_evals) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. 
As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
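The CPU / single-GPU / multi-GPU branching in the two resnet_main variants above is the same logic that later revisions factor into a distribution-strategy helper. A hedged sketch of that helper, assuming only the strategies shown here:

import tensorflow as tf

def get_distribution_strategy(num_gpus):
  """Returns a DistributionStrategy matching the branching above (sketch)."""
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)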
def run_ncf(_): """Run NCF training and eval loop.""" if FLAGS.download_if_missing: movielens.download(FLAGS.dataset, FLAGS.data_dir) num_gpus = flags_core.get_num_gpus(FLAGS) batch_size = distribution_utils.per_device_batch_size( int(FLAGS.batch_size), num_gpus) eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size) ncf_dataset = data_preprocessing.instantiate_pipeline( dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, batch_size=batch_size, eval_batch_size=eval_batch_size, num_neg=FLAGS.num_neg, epochs_per_cycle=FLAGS.epochs_between_evals, match_mlperf=FLAGS.ml_perf) model_helpers.apply_clean(flags.FLAGS) train_estimator, eval_estimator = construct_estimator( num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={ "batch_size": batch_size, "learning_rate": FLAGS.learning_rate, "num_users": ncf_dataset.num_users, "num_items": ncf_dataset.num_items, "mf_dim": FLAGS.num_factors, "model_layers": [int(layer) for layer in FLAGS.layers], "mf_regularization": FLAGS.mf_regularization, "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization], "use_tpu": FLAGS.tpu is not None, "tpu": FLAGS.tpu, "tpu_zone": FLAGS.tpu_zone, "tpu_gcp_project": FLAGS.tpu_gcp_project, }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, model_dir=FLAGS.model_dir, batch_size=FLAGS.batch_size # for ExamplesPerSecondHook ) run_params = { "batch_size": FLAGS.batch_size, "eval_batch_size": eval_batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info(model_name="recommendation", dataset_name=FLAGS.dataset, run_params=run_params, test_id=FLAGS.benchmark_test_id) approx_train_steps = int(ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) // FLAGS.batch_size) pred_input_fn = data_preprocessing.make_pred_input_fn( ncf_dataset=ncf_dataset) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) # Train the model train_input_fn, train_record_dir, batch_count = \ data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset) if np.abs(approx_train_steps - batch_count) > 1: tf.logging.warning( "Estimated ({}) and reported ({}) number of batches differ by more " "than one".format(approx_train_steps, batch_count)) train_estimator.train(input_fn=train_input_fn, hooks=train_hooks, steps=batch_count) tf.gfile.DeleteRecursively(train_record_dir) # Evaluate the model eval_results = evaluate_model(eval_estimator, ncf_dataset, pred_input_fn) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. hr = eval_results[_HR_KEY] ndcg = eval_results[_NDCG_KEY] tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # Some of the NumPy vector math can be quite large and likes to stay in # memory for a while. gc.collect() # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): break # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
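The approx_train_steps estimate in run_ncf above follows from each positive training example being paired with num_neg sampled negatives, so one epoch holds num_train_positives * (1 + num_neg) examples. A worked example with assumed MovieLens-sized numbers:

num_train_positives = 994169  # assumed, roughly the ml-1m training-set size
num_neg = 4
batch_size = 256
approx_train_steps = num_train_positives * (1 + num_neg) // batch_size
print(approx_train_steps)  # -> 19417 steps per training cycle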
def resnet_main(flags_obj, model_function, input_function, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. shape: list of ints representing the shape of the images used for training. This is only used if flags.export_dir is passed. """ # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' if flags_obj.multi_gpu: validate_batch_size_for_multi_gpu(flags_obj.batch_size) # There are two steps required if using multi-GPU: (1) wrap the model_fn, # and (2) wrap the optimizer. The first happens here, and (2) happens # in the model_fn itself when the optimizer is defined. model_function = tf.contrib.estimator.replicate_model_fn( model_function, loss_reduction=tf.losses.Reduction.MEAN) # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) # Set up a RunConfig to save checkpoint and set session config. run_config = tf.estimator.RunConfig().replace( save_checkpoints_secs=1e9, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'multi_gpu': flags_obj.multi_gpu, 'version': int(flags_obj.version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj) }) benchmark_logger = logger.config_benchmark_logger( flags_obj.benchmark_log_dir) benchmark_logger.log_run_info('resnet') train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, batch_size=flags_obj.batch_size, benchmark_log_dir=flags_obj.benchmark_log_dir) def input_fn_train(): return input_function(True, flags_obj.data_dir, flags_obj.batch_size, flags_obj.epochs_between_evals, flags_obj.num_parallel_calls, flags_obj.multi_gpu) def input_fn_eval(): return input_function(False, flags_obj.data_dir, flags_obj.batch_size, 1, flags_obj.num_parallel_calls, flags_obj.multi_gpu) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags.max_train_steps is generally associated with testing and profiling. # As a result it is frequently called with synthetic data, which will # iterate forever. Passing steps=flags.max_train_steps allows the eval # (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. 
eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: warn_on_multi_gpu_export(flags_obj.multi_gpu) # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
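Every training loop in this section ends a cycle with model_helpers.past_stop_threshold. A minimal sketch of the assumed semantics (the real helper also validates its arguments and logs when the threshold is crossed):

def past_stop_threshold(stop_threshold, eval_metric):
  """Returns True once eval_metric meets or exceeds stop_threshold (sketch)."""
  if stop_threshold is None:
    return False
  return eval_metric >= stop_threshold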
def model_fn(features, mode, params):
  # Extend the default Transformer_params dict with the user-supplied values.
  _params = Transformer_params.copy()
  for k in params:
    v = params[k]
    _params[k] = v
  params = _params

  if mode == tf.estimator.ModeKeys.PREDICT:
    features['answer'] = None

  # Define the Transformer and compute logits from the question/answer pair.
  transformer = Transformer(params, (mode == tf.estimator.ModeKeys.TRAIN))
  logits = transformer(features['question'], features['answer'])

  if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
    # Compute the loss between the network output logits and the true answer.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, features['answer'], params["label_smoothing"],
        params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Minimize the loss with Adam and the Transformer warmup schedule.
    learning_rate = get_learning_rate(params['learning_rate'],
                                      params['hidden_size'],
                                      params['learning_rate_warmup_steps'])
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=params['optimizer_adam_beta1'],
        beta2=params['optimizer_adam_beta2'],
        epsilon=params['optimizer_adam_epsilon'])
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    # Log the argmax of the logits and the reference answer every 100 steps.
    logging_hook = tf.train.LoggingTensorHook(
        {
            "logitmax": tf.argmax(logits[0], -1),
            "answer": features['answer'][0]
        }, every_n_iter=100)

    # Compute and report several metrics (accuracy, BLEU score, ...).
    eval_metric_ops = metrics.get_eval_metrics(logits, features['answer'],
                                               params)
    tensors_to_log = {}
    for k in eval_metric_ops:
      tensors_to_log[k.split('/')[-1]] = eval_metric_ops[k][1].name
      tf.summary.scalar(k.split('/')[-1], eval_metric_ops[k][1])

    # Also log and summarize the learning rate alongside the metrics.
    tensors_to_log['learning_rate'] = learning_rate
    tf.summary.scalar('learning_rate', learning_rate)

    train_hooks = hooks_helper.get_train_hooks(
        ['LoggingTensorHook'],
        model_dir=params['model_dir'],
        tensors_to_log=tensors_to_log,
        batch_size=params['batch_size'],
        use_tpu=params["use_tpu"])

  # Train
  if mode == tf.estimator.ModeKeys.TRAIN:
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, train_op=train_op, predictions=logits,
        training_hooks=[logging_hook] + train_hooks,
        eval_metric_ops=eval_metric_ops)
  # Evaluate
  elif mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, predictions=logits, eval_metric_ops=eval_metric_ops)
  # Predict
  else:
    # Save summaries during prediction as well.
    summary_hook = tf.train.SummarySaverHook(
        save_secs=1000,
        output_dir='./output/ckpt/pred',
        scaffold=tf.train.Scaffold(summary_op=tf.summary.merge_all()))
    return tf.estimator.EstimatorSpec(
        mode, predictions=logits, prediction_hooks=[summary_hook])
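The Transformer model_fn above relies on get_learning_rate for its warmup schedule. A hedged sketch assuming the standard Transformer rule (scale by hidden_size**-0.5, linear warmup, then inverse-square-root decay):

import tensorflow as tf

def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
  """Transformer learning-rate schedule (sketch of the assumed behavior)."""
  with tf.name_scope('learning_rate'):
    step = tf.to_float(tf.train.get_or_create_global_step())
    warmup_steps = tf.to_float(learning_rate_warmup_steps)
    rate = learning_rate * (hidden_size ** -0.5)
    rate *= tf.minimum(1.0, step / warmup_steps)        # linear warmup
    rate *= tf.rsqrt(tf.maximum(step, warmup_steps))    # rsqrt decay afterwards
    return tf.identity(rate, 'learning_rate')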
def resnet_main(flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config) # initialize our model with all but the dense layer from pretrained resnet if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, num_gpus=flags_core.get_num_gpus(flags_obj), dtype=flags_core.get_tf_dtype(flags_obj)) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, 
dtype=flags_core.get_tf_dtype(flags_obj)) if flags_obj.eval_only or not flags_obj.train_epochs: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals) schedule = [ flags_obj.epochs_between_evals for _ in range(int(n_loops)) ] schedule[-1] = flags_obj.train_epochs - sum( schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
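The schedule computation above (e.g. train_epochs = 25 and epochs_between_evals = 10 giving [10, 10, 5]) can be read as a small standalone helper; an illustrative sketch:

import math

def build_epoch_schedule(train_epochs, epochs_between_evals):
  """Returns per-cycle epoch counts; the last cycle absorbs the remainder."""
  n_loops = int(math.ceil(train_epochs / float(epochs_between_evals)))
  schedule = [epochs_between_evals] * n_loops
  schedule[-1] = train_epochs - sum(schedule[:-1])  # correct the over-count
  return schedule

print(build_epoch_schedule(25, 10))  # -> [10, 10, 5]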
def run_ncf(_): """Run NCF training and eval loop.""" if FLAGS.download_if_missing and not FLAGS.use_synthetic_data: movielens.download(FLAGS.dataset, FLAGS.data_dir) if FLAGS.seed is not None: np.random.seed(FLAGS.seed) num_gpus = flags_core.get_num_gpus(FLAGS) batch_size = distribution_utils.per_device_batch_size( int(FLAGS.batch_size), num_gpus) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1 eval_batch_size = int(FLAGS.eval_batch_size or max([FLAGS.batch_size, eval_per_user])) if eval_batch_size % eval_per_user: eval_batch_size = eval_batch_size // eval_per_user * eval_per_user tf.logging.warning( "eval examples per user does not evenly divide eval_batch_size. " "Overriding to {}".format(eval_batch_size)) if FLAGS.use_synthetic_data: ncf_dataset = None cleanup_fn = lambda: None num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[ FLAGS.dataset] num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH else: ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline( dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, batch_size=batch_size, eval_batch_size=eval_batch_size, num_neg=FLAGS.num_neg, epochs_per_cycle=FLAGS.epochs_between_evals, num_cycles=total_training_cycle, match_mlperf=FLAGS.ml_perf, deterministic=FLAGS.seed is not None, use_subprocess=FLAGS.use_subprocess, cache_id=FLAGS.cache_id) num_users = ncf_dataset.num_users num_items = ncf_dataset.num_items num_train_steps = int(np.ceil( FLAGS.epochs_between_evals * ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) / FLAGS.batch_size)) num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users / eval_batch_size)) model_helpers.apply_clean(flags.FLAGS) params = { "use_seed": FLAGS.seed is not None, "hash_pipeline": FLAGS.hash_pipeline, "batch_size": batch_size, "eval_batch_size": eval_batch_size, "learning_rate": FLAGS.learning_rate, "num_users": num_users, "num_items": num_items, "mf_dim": FLAGS.num_factors, "model_layers": [int(layer) for layer in FLAGS.layers], "mf_regularization": FLAGS.mf_regularization, "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization], "num_neg": FLAGS.num_neg, "use_tpu": FLAGS.tpu is not None, "tpu": FLAGS.tpu, "tpu_zone": FLAGS.tpu_zone, "tpu_gcp_project": FLAGS.tpu_gcp_project, "beta1": FLAGS.beta1, "beta2": FLAGS.beta2, "epsilon": FLAGS.epsilon, "match_mlperf": FLAGS.ml_perf, "use_xla_for_gpu": FLAGS.use_xla_for_gpu, "use_estimator": FLAGS.use_estimator, } if FLAGS.use_estimator: train_estimator, eval_estimator = construct_estimator( num_gpus=num_gpus, model_dir=FLAGS.model_dir, iterations=num_train_steps, params=params, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size) else: runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps, num_eval_steps, FLAGS.use_while_loop) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, model_dir=FLAGS.model_dir, batch_size=FLAGS.batch_size, # for ExamplesPerSecondHook tensors_to_log={"cross_entropy": "cross_entropy"} ) run_params = { "batch_size": FLAGS.batch_size, "eval_batch_size": eval_batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info( model_name="recommendation", dataset_name=FLAGS.dataset, 
run_params=run_params, test_id=FLAGS.benchmark_test_id) eval_input_fn = None target_reached = False mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP) for cycle_index in range(total_training_cycle): assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH, value=cycle_index) # Train the model if FLAGS.use_estimator: train_input_fn, train_record_dir, batch_count = \ data_preprocessing.make_input_fn( ncf_dataset=ncf_dataset, is_training=True) if batch_count != num_train_steps: raise ValueError( "Step counts do not match. ({} vs. {}) The async process is " "producing incorrect shards.".format(batch_count, num_train_steps)) train_estimator.train(input_fn=train_input_fn, hooks=train_hooks, steps=num_train_steps) if train_record_dir: tf.gfile.DeleteRecursively(train_record_dir) tf.logging.info("Beginning evaluation.") if eval_input_fn is None: eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn( ncf_dataset=ncf_dataset, is_training=False) if eval_batch_count != num_eval_steps: raise ValueError( "Step counts do not match. ({} vs. {}) The async process is " "producing incorrect shards.".format( eval_batch_count, num_eval_steps)) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START, value=cycle_index) eval_results = eval_estimator.evaluate(eval_input_fn, steps=num_eval_steps) tf.logging.info("Evaluation complete.") else: runner.train() tf.logging.info("Beginning evaluation.") mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START, value=cycle_index) eval_results = runner.eval() tf.logging.info("Evaluation complete.") hr = float(eval_results[rconst.HR_KEY]) ndcg = float(eval_results[rconst.NDCG_KEY]) mlperf_helper.ncf_print( key=mlperf_helper.TAGS.EVAL_TARGET, value={"epoch": cycle_index, "value": FLAGS.hr_threshold}) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY, value={"epoch": cycle_index, "value": hr}) mlperf_helper.ncf_print( key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG, value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES}) # Logged by the async process during record creation. mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS, deferred=True) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. tf.logging.info( "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): target_reached = True break mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP, value={"success": target_reached}) cleanup_fn() # Cleanup data construction artifacts and subprocess. # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session() mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
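The eval_batch_size rounding in the run_ncf variant above keeps each evaluated user's group of one positive plus NUM_EVAL_NEGATIVES negatives inside a single batch. A worked example with an assumed constant:

NUM_EVAL_NEGATIVES = 999                 # assumed value of rconst.NUM_EVAL_NEGATIVES
eval_per_user = NUM_EVAL_NEGATIVES + 1   # 1000 eval examples per user
eval_batch_size = 100100
if eval_batch_size % eval_per_user:
  eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
print(eval_batch_size)  # -> 100000, rounded down to a multiple of eval_per_user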
def run_mnist(flags_obj):
  """Run MNIST training and eval loop."""
  model_helpers.apply_clean(flags_obj)
  model_function = model_fn

  # With "with tf.device('/cpu:0'):" an op can be pinned to a device manually; if
  # that device does not exist or is unavailable, the TF program hangs or fails.
  # allow_soft_placement lets TensorFlow fall back to an available device.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      all_reduce_alg=flags_obj.all_reduce_alg)

  run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy,
                                      session_config=session_config)

  data_format = flags_obj.data_format
  if data_format is None:
    # data_format = ('channels_first' if tf.test.is_built_with_cuda()
    #                else 'channels_last')
    data_format = 'channels_last'

  mnist_classifier = tf.estimator.Estimator(
      model_fn=model_function,
      model_dir=flags_obj.model_dir,
      config=run_config,
      params={'data_format': data_format})

  def train_input_fn():
    ds = dataset.train(flags_obj.data_dir)
    ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size)
    # repeat replays the whole sequence several times; it is how epochs are
    # expressed in the input pipeline.
    ds = ds.repeat(flags_obj.epochs_between_evals)
    return ds

  def eval_input_fn():
    # make_one_shot_iterator creates an `Iterator` for enumerating the
    # elements of this dataset.
    return dataset.test(flags_obj.data_dir).batch(
        flags_obj.batch_size).make_one_shot_iterator().get_next()

  train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks,
                                             model_dir=flags_obj.model_dir,
                                             batch_size=flags_obj.batch_size)

  for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print('\nEvaluation results:\n\t%s\n' % eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    image = tf.placeholder(tf.float32, [None, 28, 28])
    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'image': image})
    mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn,
                                       strip_default_attrs=True)
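A hedged usage sketch for the SavedModel exported at the end of run_mnist above, using the TF 1.x predictor API; the export path, timestamped version directory, and output keys are assumptions, not taken from this file:

import numpy as np
import tensorflow as tf

# Path to one exported SavedModel version directory (assumed layout).
export_path = '/tmp/mnist_saved_model/1540000000'
predict_fn = tf.contrib.predictor.from_saved_model(export_path)

digits = np.zeros((1, 28, 28), dtype=np.float32)  # one blank 28x28 test image
predictions = predict_fn({'image': digits})       # feed key matches the receiver fn
print(predictions)  # dict of outputs defined by the model_fn's export_outputs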
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. Returns: Dict of results of the run. """ model_helpers.apply_clean(flags.FLAGS) # Ensures flag override logic is only executed if explicitly triggered. if flags_obj.tf_gpu_thread_mode: override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) # Creates session config. allow_soft_placement = True, is required for # multi-GPU and is not harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) # Creates a `RunConfig` that checkpoints every 24 hours which essentially # results in checkpoints determined only by `epochs_between_evals`. run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config, save_checkpoints_secs=60*60*24) # Initializes model with all but the dense layer from pretrained ResNet. 
if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, dtype=flags_core.get_tf_dtype(flags_obj), datasets_num_private_threads=flags_obj.datasets_num_private_threads, num_parallel_batches=flags_obj.datasets_num_parallel_batches) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) if flags_obj.eval_only or not flags_obj.train_epochs: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals) schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))] schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. 
# Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. export_dtype = flags_core.get_tf_dtype(flags_obj) if flags_obj.image_bytes_as_serving_input: input_receiver_fn = functools.partial( image_bytes_serving_input_fn, shape, dtype=export_dtype) else: input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size, dtype=export_dtype) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True) return eval_results
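The export step above hands classifier.export_savedmodel a serving input receiver built by export.build_tensor_serving_input_receiver_fn. A minimal sketch of what that builder is assumed to return:

import tensorflow as tf

def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32,
                                           batch_size=1):
  """Returns a serving_input_receiver_fn for one dense input tensor (sketch)."""
  def serving_input_receiver_fn():
    features = tf.placeholder(dtype=dtype, shape=[batch_size] + shape,
                              name='input_tensor')
    return tf.estimator.export.TensorServingInputReceiver(
        features=features, receiver_tensors=features)
  return serving_input_receiver_fn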
def run_ncf(_): """Run NCF training and eval loop.""" # Data preprocessing # The file name of training and test dataset train_fname = os.path.join( FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME) test_fname = os.path.join( FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME) neg_fname = os.path.join(FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME) assert os.path.exists(train_fname), ( "Run data_download.py first to download and extract {} dataset".format( FLAGS.dataset)) tf.logging.info("Data preprocessing...") ncf_dataset = dataset.data_preprocessing(train_fname, test_fname, neg_fname, FLAGS.num_neg) # Create NeuMF model and convert it to Estimator tf.logging.info("Creating Estimator from Keras model...") layers = [int(layer) for layer in FLAGS.layers] mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization] keras_model = neumf_model.NeuMF(ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors, layers, FLAGS.batch_size, FLAGS.mf_regularization, mlp_regularization) num_gpus = flags_core.get_num_gpus(FLAGS) estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir) # Create hooks that log information about the training and metric values train_hooks = hooks_helper.get_train_hooks( FLAGS.hooks, batch_size=FLAGS.batch_size # for ExamplesPerSecondHook ) run_params = { "batch_size": FLAGS.batch_size, "number_factors": FLAGS.num_factors, "hr_threshold": FLAGS.hr_threshold, "train_epochs": FLAGS.train_epochs, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info(model_name="recommendation", dataset_name=FLAGS.dataset, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Training and evaluation cycle def train_input_fn(): return dataset.input_fn( True, per_device_batch_size(FLAGS.batch_size, num_gpus), ncf_dataset, FLAGS.epochs_between_evals) total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: {}/{}".format( cycle_index + 1, total_training_cycle)) # Train the model estimator.train(input_fn=train_input_fn, hooks=train_hooks) # Evaluate the model eval_results = evaluate_model(estimator, FLAGS.batch_size, num_gpus, ncf_dataset) # Benchmark the evaluation results benchmark_logger.log_evaluation_result(eval_results) # Log the HR and NDCG results. hr = eval_results[_HR_KEY] ndcg = eval_results[_NDCG_KEY] tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format( cycle_index + 1, hr, ndcg)) # If some evaluation threshold is met if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr): break # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
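run_ncf above converts the Keras NeuMF model into an Estimator through convert_keras_to_estimator. A hedged sketch of one way that conversion is assumed to look; the optimizer, loss, and multi-GPU handling are illustrative choices, not confirmed by this file:

import tensorflow as tf

def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
  """Compiles the Keras model and wraps it as a tf.estimator.Estimator (sketch)."""
  if num_gpus > 1:
    # Replicate across GPUs before conversion when several are available.
    keras_model = tf.keras.utils.multi_gpu_model(keras_model, gpus=num_gpus)
  keras_model.compile(optimizer=tf.train.AdamOptimizer(),
                      loss='binary_crossentropy')
  return tf.keras.estimator.model_to_estimator(
      keras_model=keras_model, model_dir=model_dir)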