def run_movie(flags_obj):
  """Construct all necessary functions and call run_loop.

  Args:
    flags_obj: Object containing user specified flags.
  """
  if flags_obj.download_if_missing:
    movielens.download(dataset=flags_obj.dataset, data_dir=flags_obj.data_dir)

  train_input_fn, eval_input_fn, model_column_fn = \
      movielens_dataset.construct_input_fns(
          dataset=flags_obj.dataset, data_dir=flags_obj.data_dir,
          batch_size=flags_obj.batch_size,
          repeat=flags_obj.epochs_between_evals)

  tensors_to_log = {
      'loss': '{loss_prefix}head/weighted_loss/value'
  }

  wide_deep_run_loop.run_loop(
      name="MovieLens",
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      model_column_fn=model_column_fn,
      build_estimator_fn=build_estimator,
      flags_obj=flags_obj,
      tensors_to_log=tensors_to_log,
      early_stop=False)
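# A minimal sketch of how run_movie might be wired to an absl entry point;
# the flag registration is assumed to happen elsewhere (e.g. a define_flags()
# helper), so treat this as illustrative rather than canonical.
from absl import app as absl_app
from absl import flags


def main_movie(_):
  run_movie(flags.FLAGS)


if __name__ == "__main__":
  # Assumes --dataset, --data_dir, --batch_size, --epochs_between_evals and
  # --download_if_missing have been defined before parsing.
  absl_app.run(main_movie)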
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer
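# Illustrative consumption of get_inputs, mirroring the run_ncf loops later
# in this file; parse_flags, construct_estimator and FLAGS come from the
# surrounding code, and the wiring here is a sketch rather than the
# canonical setup.
params = parse_flags(FLAGS)
num_users, num_items, num_train_steps, num_eval_steps, producer = (
    get_inputs(params))
params["num_users"], params["num_items"] = num_users, num_items
producer.start()  # Begin asynchronous batch construction.
estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)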
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.prebatch_size,
      "eval_batch_size": flag_obj.prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "prebatch_size": flag_obj.prebatch_size,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
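# Illustrative follow-up (an assumption, not part of prepare_raw_data):
# persisting input_metadata so a later training job can recover the dataset
# dimensions without re-running preprocessing. The file name is hypothetical.
import json
import os

producer, input_metadata = prepare_raw_data(flags.FLAGS)
with open(os.path.join(flags.FLAGS.data_dir, "input_metadata.json"), "w") as f:
  json.dump(input_metadata, f)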
def main(_):
  movielens.download(dataset=flags.FLAGS.dataset,
                     data_dir=flags.FLAGS.data_dir)
  construct_input_fns(flags.FLAGS.dataset, flags.FLAGS.data_dir)
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  params = parse_flags(FLAGS)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  producer.start()
  params["num_users"], params["num_items"] = num_users, num_items
  model_helpers.apply_clean(flags.FLAGS)

  estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
  benchmark_logger, train_hooks = log_and_get_hooks(params["eval_batch_size"])

  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)
    train_input_fn = producer.make_input_fn(is_training=True)
    estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                    steps=num_train_steps)

    tf.logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  producer.stop_loop()
  producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
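# A sketch of what log_and_get_hooks (called above) plausibly does, assembled
# from the hook/logger boilerplate that appears inline in the other run_ncf
# variants in this file; the exact body is an assumption.
def log_and_get_hooks(eval_batch_size):
  """Convenience wrapper bundling hook creation and benchmark logging."""
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)
  return benchmark_logger, train_hooks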
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(
        np.ceil(FLAGS.epochs_between_evals *
                ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                FLAGS.batch_size))
    num_eval_steps = int(
        np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "eval_batch_size": eval_batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": num_users,
          "num_items": num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
          "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  pred_input_fn = None
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

    if batch_count != num_train_steps:
      raise ValueError(
          "Step counts do not match. ({} vs. {}) The async process is "
          "producing incorrect shards.".format(batch_count, num_train_steps))

    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=num_train_steps)
    if train_record_dir:
      tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    if pred_input_fn is None:
      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
          ncf_dataset=ncf_dataset, is_training=False)

      if eval_batch_count != num_eval_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(
                eval_batch_count, num_eval_steps))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = eval_estimator.evaluate(pred_input_fn,
                                           steps=num_eval_steps)
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    tf.logging.info("Evaluation complete.")

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
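# Worked example of the eval_batch_size rounding above, assuming
# rconst.NUM_EVAL_NEGATIVES == 999 so that each user contributes exactly
# 1,000 eval examples. Any requested size is rounded *down* to a multiple of
# eval_per_user so that no user's examples are split across batches.
eval_per_user = 999 + 1
assert 100500 // eval_per_user * eval_per_user == 100000  # rounded down
assert 160000 // eval_per_user * eval_per_user == 160000  # already a multiple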
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives *
                           (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(
        eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
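# A sketch of distribution_utils.per_device_batch_size consistent with its
# call sites above (the helper's exact body is an assumption): the global
# batch size is split evenly across GPUs, and an uneven split raises rather
# than silently truncating.
def per_device_batch_size(batch_size, num_gpus):
  if num_gpus <= 1:
    return batch_size
  if batch_size % num_gpus:
    raise ValueError(
        "batch_size ({}) must be divisible by num_gpus ({})".format(
            batch_size, num_gpus))
  return batch_size // num_gpus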
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives *
                           (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = (1 if FLAGS.inference_only else
                          FLAGS.train_epochs // FLAGS.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    if not FLAGS.inference_only:
      # Train the model
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

      if np.abs(approx_train_steps - batch_count) > 1:
        tf.logging.warning(
            "Estimated ({}) and reported ({}) number of batches differ by "
            "more than one".format(approx_train_steps, batch_count))
      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=batch_count)
      tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # Export SavedModel
    if FLAGS.export_savedmodel:
      eval_estimator.export_savedmodel(FLAGS.model_dir,
                                       serving_input_receiver_fn)
      print("SavedModel successfully exported to: {}/<timestamp>".format(
          FLAGS.model_dir))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
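# A minimal sketch of the serving_input_receiver_fn referenced by the
# export_savedmodel call above; the feature names and placeholder dtypes are
# assumptions based on the MovieLens columns used elsewhere in this codebase.
def serving_input_receiver_fn():
  users = tf.placeholder(dtype=tf.int32, shape=[None], name="user_id")
  items = tf.placeholder(dtype=tf.int32, shape=[None], name="item_id")
  return tf.estimator.export.ServingInputReceiver(
      features={movielens.USER_COLUMN: users, movielens.ITEM_COLUMN: items},
      receiver_tensors={"user_id": users, "item_id": items})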
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count,
                                                 num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
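# Hedged sketch of model_helpers.past_stop_threshold as used in the loops
# above (the real helper also validates its argument types): returns True
# once the metric reaches the threshold, and never stops when no threshold
# is set.
def past_stop_threshold(stop_threshold, eval_metric):
  if stop_threshold is None:
    return False
  return eval_metric >= stop_threshold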
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  movielens_dataset.construct_train_eval_csv(
      data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

  tf.logging.info("Data preprocessing...")
  ncf_dataset = movielens_dataset.data_preprocessing(
      FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

  model_helpers.apply_clean(flags.FLAGS)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items,
      FLAGS.num_factors, layers, FLAGS.batch_size,
      FLAGS.mf_regularization, mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus,
                                         FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  # Training and evaluation cycle
  def get_train_input_fn():
    return movielens_dataset.get_input_fn(
        True,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset,
        FLAGS.epochs_between_evals)

  def get_pred_input_fn():
    return movielens_dataset.get_input_fn(
        False,
        distribution_utils.per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.data_dir, FLAGS.dataset, 1)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=get_train_input_fn(), hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset,
        get_pred_input_fn())

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
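# A plausible sketch of convert_keras_to_estimator, assuming it wraps the
# TF 1.x tf.keras.estimator.model_to_estimator API; the compile arguments
# and distribution handling are assumptions for illustration only.
def convert_keras_to_estimator(keras_model, num_gpus, model_dir):
  keras_model.compile(optimizer="adam", loss="binary_crossentropy")
  distribution = distribution_utils.get_distribution_strategy(num_gpus)
  run_config = tf.estimator.RunConfig(train_distribute=distribution)
  return tf.keras.estimator.model_to_estimator(
      keras_model=keras_model, model_dir=model_dir, config=run_config)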
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf,
      deterministic=FLAGS.seed is not None)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation", dataset_name=FLAGS.dataset,
      run_params=run_params, test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives *
                           (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    eval_results = eval_estimator.evaluate(pred_input_fn)
    tf.logging.info("Evaluation complete.")

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[rconst.HR_KEY]
    ndcg = eval_results[rconst.NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
def main(_):
  movielens.download(dataset=flags.FLAGS.dataset,
                     data_dir=flags.FLAGS.data_dir)
  construct_train_eval_csv(flags.FLAGS.data_dir, flags.FLAGS.dataset)
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4,
                         epochs_per_cycle=1):
  """Preprocess data and start negative generation subprocess."""
  movielens.download(dataset=dataset, data_dir=data_dir)

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would
  # cause contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  python = "python3" if six.PY3 else "python2"

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75)

  subproc_args = [
      python, _ASYNC_GEN_PATH,
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      # Spillover allows the training input function to guarantee batch
      # size and significantly improves performance. (~5% increase in
      # examples/sec on GPU, and needed for TPU XLA.)
      "--spillover", "True",
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  return ncf_dataset
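# Hedged usage sketch for instantiate_pipeline; the dataset name, paths and
# batch sizes are illustrative, and cleanup is handled by the atexit hooks
# registered inside the function.
ncf_dataset = instantiate_pipeline(
    dataset="ml-20m", data_dir="/tmp/movielens-data",
    batch_size=2048, eval_batch_size=16384, num_neg=4)
print("num_items: {}, train positives: {}".format(
    ncf_dataset.num_items, ncf_dataset.num_train_positives))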