def _build_model(self, params, num_steps, is_training):
  """Builds the NCF model.

  Args:
    params: A dict of hyperparameters.
    num_steps: The number of steps to run per training or evaluation cycle.
    is_training: If True, build the training model. If False, build the
      evaluation model.

  Returns:
    A _TrainModelProperties if is_training is True, or an
    _EvalModelProperties otherwise.
  """
  record_files_placeholder = tf.placeholder(tf.string, ())
  input_fn, _, _ = data_preprocessing.make_input_fn(
      ncf_dataset=self._ncf_dataset, is_training=is_training,
      record_files=record_files_placeholder)
  dataset = input_fn(params)
  iterator = dataset.make_initializable_iterator()

  model_fn = neumf_model.neumf_model_fn
  if params["use_xla_for_gpu"]:
    model_fn = xla.estimator_model_fn(model_fn)

  if is_training:
    return self._build_train_specific_graph(
        iterator, model_fn, params, record_files_placeholder, num_steps)
  else:
    return self._build_eval_specific_graph(
        iterator, model_fn, params, record_files_placeholder, num_steps)
def _build_model(self, params, is_training):
  """Builds the NCF model.

  Args:
    params: A dict of hyperparameters.
    is_training: If True, build the training model. If False, build the
      evaluation model.

  Returns:
    A _TrainModelProperties if is_training is True, or an
    _EvalModelProperties otherwise.
  """
  record_files_placeholder = tf.placeholder(tf.string, ())
  input_fn, _, _ = data_preprocessing.make_input_fn(
      ncf_dataset=self._ncf_dataset, is_training=is_training,
      record_files=record_files_placeholder)
  dataset = input_fn(params)
  iterator = dataset.make_initializable_iterator()

  model_fn = neumf_model.neumf_model_fn
  if params["use_xla_for_gpu"]:
    model_fn = xla.estimator_model_fn(model_fn)

  if is_training:
    features, labels = iterator.get_next()
    estimator_spec = model_fn(
        features, labels, tf.estimator.ModeKeys.TRAIN, params)
    with tf.control_dependencies([estimator_spec.train_op]):
      run_model_op = self._global_step.assign_add(1)
    return self._TrainModelProperties(
        record_files_placeholder, iterator, estimator_spec.loss,
        params["batch_size"], run_model_op)
  else:
    features = iterator.get_next()
    estimator_spec = model_fn(
        features, None, tf.estimator.ModeKeys.EVAL, params)
    run_model_op = tf.group(*(update_op for _, update_op in
                              estimator_spec.eval_metric_ops.values()))
    metric_initializer = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))
    return self._EvalModelProperties(
        record_files_placeholder, iterator, estimator_spec.loss,
        params["eval_batch_size"], run_model_op,
        estimator_spec.eval_metric_ops, metric_initializer)
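A minimal sketch of how the handles returned by _build_model (is_training=True) might be driven outside of Estimator. The field names follow the _TrainModelProperties constructor call above; the session setup, record-file value, and step count are illustrative assumptions, not part of the runner.

def run_one_train_epoch(train_props, record_files, num_steps):
  # Hypothetical driver loop; assumes it is called under the graph in which
  # _build_model was invoked and that variables are already initialized.
  with tf.Session() as sess:
    # The iterator is initializable so the same graph can be re-pointed at a
    # fresh set of record shards each epoch via the string placeholder.
    sess.run(train_props.iterator.initializer,
             feed_dict={train_props.record_files_placeholder: record_files})
    for _ in range(num_steps):
      # run_model_op applies the gradients (through the control dependency
      # on estimator_spec.train_op) and then increments the global step.
      sess.run(train_props.run_model_op)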
def test_end_to_end(self):
  ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir,
      batch_size=BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE,
      num_cycles=1, num_data_readers=2, num_neg=NUM_NEG)

  g = tf.Graph()
  with g.as_default():
    input_fn, record_dir, batch_count = data_preprocessing.make_input_fn(
        ncf_dataset, True)
    dataset = input_fn({
        "batch_size": BATCH_SIZE,
        "use_tpu": False,
        "use_xla_for_gpu": False,
    })

  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
  item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

  train_examples = {
      True: set(),
      False: set(),
  }
  for features, labels in first_epoch:
    for u, i, l in zip(features[movielens.USER_COLUMN],
                       features[movielens.ITEM_COLUMN], labels):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        assert i_raw == self.holdout[u_raw][1]
      train_examples[l].add((u_raw, i_raw))

  num_positives_seen = len(train_examples[True])
  assert ncf_dataset.num_train_positives == num_positives_seen

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
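The final assert tolerates duplicates because negatives are drawn with replacement, so the distinct-pair count can only undershoot NUM_NEG * num_positives. A rough sanity model for the 0.9 threshold, assuming (purely for illustration) that a user's negatives are drawn uniformly from n candidate items; the actual margin depends on the ratio of draws to pool size.

def expected_distinct_fraction(n, k):
  # Expected fraction of distinct values when drawing k times with
  # replacement from n equally likely candidates:
  #   E[distinct] / k = n * (1 - (1 - 1/n) ** k) / k
  return n * (1.0 - (1.0 - 1.0 / n) ** k) / k

# The fraction degrades as the number of draws approaches the pool size,
# which is why the test asserts only "reasonably random" (> 0.9).
print(expected_distinct_fraction(n=1000, k=100))  # ~0.95
print(expected_distinct_fraction(n=1000, k=400))  # ~0.82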
def build_graph(params, ncf_dataset, mode):
  """Build graph_spec with graph and some useful handles."""
  tf.logging.info("building graph for mode {}.".format(mode))
  with tf.Graph().as_default() as graph:
    embedding = get_embedding(params, mode)
    tf.logging.info("tpu_embedding_config_proto: {}.".format(
        embedding.config_proto))

    if mode == tpu_embedding.INFERENCE:
      assert (params["batch_size"] %
              (embedding.num_cores * (1 + rconst.NUM_EVAL_NEGATIVES))) == 0

    input_fn, train_record_dir, batch_count = data_preprocessing.make_input_fn(
        ncf_dataset=ncf_dataset, is_training=(mode == tpu_embedding.TRAINING))

    get_infeed_thread_fn, infeed_queue = build_infeed(
        input_fn, params, batch_count, embedding, mode)

    tpu_loop = build_tpu_loop(infeed_queue, params, batch_count,
                              embedding, mode)

    def run_tpu_loop(sess, epoch):
      if mode == tpu_embedding.TRAINING:
        sess.run(tpu_loop)
      else:
        total_values, count_values = sess.run(tpu_loop)
        hr = np.sum(total_values) / np.sum(count_values)
        tf.logging.info("HR = {} after epoch {}.".format(hr, epoch))

    hook_before, hook_after = build_hooks(mode, embedding, params,
                                          train_record_dir)

  return GraphSpec(graph, embedding, run_tpu_loop, get_infeed_thread_fn,
                   hook_before, hook_after)
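A sketch of how a GraphSpec produced here might be consumed for one cycle. The hook and infeed-fn signatures are assumptions (the source only shows what build_hooks and build_infeed return), and TPU target/session setup is elided.

import threading

def run_cycle(graph_spec, sess, num_epochs):
  # Hypothetical consumer; assumes sess was created against graph_spec.graph
  # and that the hooks and infeed fn take a session (signatures assumed).
  graph_spec.hook_before(sess)
  for epoch in range(num_epochs):
    # Host-side infeed must run concurrently with the device loop: the
    # thread enqueues batches while tpu_loop dequeues them on the device.
    infeed_thread = threading.Thread(
        target=graph_spec.get_infeed_thread_fn(sess))
    infeed_thread.start()
    graph_spec.run_tpu_loop(sess, epoch)  # logs HR when not training
    infeed_thread.join()
  graph_spec.hook_after(sess)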
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil(
        (1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
        eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }

  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count,
                                                 num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
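The eval batch sizing above exists because every user contributes exactly eval_per_user = 1 + rconst.NUM_EVAL_NEGATIVES rows at eval time, and HR/NDCG are computed per user, so a batch must never split one user's rows. A worked example of the rounding, using an illustrative value of 999 for the constant:

NUM_EVAL_NEGATIVES = 999  # illustrative; the real value lives in rconst
eval_per_user = NUM_EVAL_NEGATIVES + 1  # 1000 eval rows per user

for requested in (160000, 100500):
  if requested % eval_per_user:
    # Round down to a whole number of users per batch, as run_ncf does.
    rounded = requested // eval_per_user * eval_per_user
  else:
    rounded = requested
  print(requested, "->", rounded)
# 160000 -> 160000 (already a multiple of 1000)
# 100500 -> 100000 (500 trailing rows would have split a user)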
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(
        np.ceil(FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
                (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(
        np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir,
      params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "eval_batch_size": eval_batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": num_users,
          "num_items": num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
          "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      },
      batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  pred_input_fn = None
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

    if batch_count != num_train_steps:
      raise ValueError(
          "Step counts do not match. ({} vs. {}) The async process is "
          "producing incorrect shards.".format(batch_count, num_train_steps))

    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=num_train_steps)
    if train_record_dir:
      tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    if pred_input_fn is None:
      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
          ncf_dataset=ncf_dataset, is_training=False)

      if eval_batch_count != num_eval_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(
                eval_batch_count, num_eval_steps))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = eval_estimator.evaluate(pred_input_fn,
                                           steps=num_eval_steps)
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    tf.logging.info("Evaluation complete.")

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
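Both versions of run_ncf size their loops the same way: a training epoch holds num_train_positives * (1 + num_neg) examples, evaluation holds num_users * (1 + NUM_EVAL_NEGATIVES) rows, and each total is divided by its batch size and rounded up. A worked example with rough ML-1M-scale numbers (illustrative, not exact):

import numpy as np

# Rough ML-1M-scale inputs; real counts come from ncf_dataset and FLAGS.
num_train_positives = 994169   # ratings minus one holdout item per user
num_users = 6040
num_neg = 4                    # training negatives per positive
num_eval_negatives = 999       # illustrative stand-in for the rconst value
epochs_between_evals = 1
batch_size = 98304
eval_batch_size = 100000       # already a multiple of 1000

num_train_steps = int(np.ceil(
    epochs_between_evals * num_train_positives * (1 + num_neg) / batch_size))
num_eval_steps = int(np.ceil(
    (1 + num_eval_negatives) * num_users / eval_batch_size))
print(num_train_steps, num_eval_steps)  # 51 61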