def test_data_preprocessing(self):
    """Check the output of ``data_preprocessing`` on the fixture in ``self.temp_dir``.

    Covers the training rows, the reported user/item counts, and the
    agreement between ``all_eval_data``, ``eval_true_items`` and
    ``eval_all_items`` for the first and last user.
    """
    dataset = movielens_dataset.data_preprocessing(
        self.temp_dir, movielens.ML_1M, _NUM_NEG)

    # Column 2 of every preprocessed training row must be 1.
    third_column = np.array(dataset.train_data)[:, 2]
    all_ones = np.full(len(dataset.train_data), 1)
    self.assertAllEqual(third_column, all_ones)

    # The fixture is expected to yield 2 users and 175 items.
    self.assertEqual(dataset.num_users, 2)
    self.assertEqual(dataset.num_items, 175)

    # Row 100 and the final row of the eval data must each hold, in
    # column 1, the true item of the first / last user, which is also the
    # last entry of that user's full eval item list.
    eval_rows = dataset.all_eval_data
    for row_index, user_pos in ((100, 0), (-1, -1)):
        eval_row = eval_rows[row_index]
        self.assertEqual(eval_row[1], dataset.eval_true_items[user_pos])
        self.assertEqual(eval_row[1], dataset.eval_all_items[user_pos][-1])

    # The items listed for user id 0 / 1 in the eval rows must equal the
    # first / last entry of eval_all_items.
    eval_list = eval_rows.tolist()
    for user_id, user_pos in ((0, 0), (1, -1)):
        items_for_user = [row[1] for row in eval_list if row[0] == user_id]
        self.assertAllEqual(items_for_user, dataset.eval_all_items[user_pos])
def test_generate_train_dataset(self):
    """Check the layout of the rows produced by ``generate_train_dataset``.

    The generated data is expected to hold, per positive instance, one
    positive row (column 2 == 1) followed by ``_NUM_NEG`` negative rows
    (column 2 == 0) that share the user (column 0) but carry a different
    item (column 1). Both the first and the last such group are verified.
    """
    ncf_dataset = movielens_dataset.data_preprocessing(
        self.temp_dir, movielens.ML_1M, _NUM_NEG)
    train_dataset = movielens_dataset.generate_train_dataset(
        ncf_dataset.train_data, ncf_dataset.num_items, _NUM_NEG)

    # Each user has 1 positive instance followed by _NUM_NEG negative
    # instances: check the first group (rows 0 .. _NUM_NEG).
    train_data_0 = train_dataset[0]
    self.assertEqual(train_data_0[2], 1)
    for i in range(1, _NUM_NEG + 1):
        train_data = train_dataset[i]
        self.assertEqual(train_data_0[0], train_data[0])
        self.assertNotEqual(train_data_0[1], train_data[1])
        self.assertEqual(0, train_data[2])

    # Check the last group: the positive row sits _NUM_NEG rows before the
    # final row, and the trailing rows -1 .. -_NUM_NEG are its negatives.
    train_data_last = train_dataset[-1 - _NUM_NEG]
    self.assertEqual(train_data_last[2], 1)
    # The step must be -1: range(-1, -_NUM_NEG) counts upwards from -1 and
    # is therefore empty, which silently skipped every assertion below.
    for i in range(-1, -_NUM_NEG - 1, -1):
        train_data = train_dataset[i]
        self.assertEqual(train_data_last[0], train_data[0])
        self.assertNotEqual(train_data_last[1], train_data[1])
        self.assertEqual(0, train_data[2])
def run_ncf(_):
    """Train and evaluate the NeuMF model in alternating cycles.

    All configuration is read from ``FLAGS``; the positional argument is
    ignored. Each cycle trains the estimator, evaluates it, logs HR and
    NDCG, and stops early once ``FLAGS.hr_threshold`` is passed.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)
        movielens_dataset.construct_train_eval_csv(
            data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

    tf.logging.info("Data preprocessing...")
    ncf_dataset = movielens_dataset.data_preprocessing(
        FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

    model_helpers.apply_clean(flags.FLAGS)

    # Build the Keras NeuMF model, then wrap it as an Estimator.
    tf.logging.info("Creating Estimator from Keras model...")
    hidden_layers = list(map(int, FLAGS.layers))
    mlp_reg = list(map(float, FLAGS.mlp_regularization))
    neumf = neumf_model.NeuMF(
        ncf_dataset.num_users,
        ncf_dataset.num_items,
        FLAGS.num_factors,
        hidden_layers,
        FLAGS.batch_size,
        FLAGS.mf_regularization,
        mlp_reg)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(neumf, num_gpus, FLAGS.model_dir)

    # Hooks that log training information and metric values.
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
    )

    # Record the run configuration with the benchmark logger.
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params={
            "batch_size": FLAGS.batch_size,
            "number_factors": FLAGS.num_factors,
            "hr_threshold": FLAGS.hr_threshold,
            "train_epochs": FLAGS.train_epochs,
        },
        test_id=FLAGS.benchmark_test_id)

    def build_input_fn(is_training, repeat):
        """Return a fresh input fn for training (True) or prediction (False)."""
        return movielens_dataset.get_input_fn(
            is_training,
            distribution_utils.per_device_batch_size(
                FLAGS.batch_size, num_gpus),
            ncf_dataset,
            FLAGS.data_dir,
            FLAGS.dataset,
            repeat)

    # Training and evaluation cycle.
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    for cycle_number in range(1, total_training_cycle + 1):
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_number, total_training_cycle))

        # Train for FLAGS.epochs_between_evals epochs.
        estimator.train(
            input_fn=build_input_fn(True, FLAGS.epochs_between_evals),
            hooks=train_hooks)

        # Evaluate on a single pass over the prediction data.
        eval_results = evaluate_model(
            estimator, FLAGS.batch_size, num_gpus, ncf_dataset,
            build_input_fn(False, 1))
        benchmark_logger.log_evaluation_result(eval_results)

        # Log the HR and NDCG results.
        hr = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_number, hr, ndcg))

        # Stop as soon as the hit-rate threshold has been passed.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            break

    # Clear the session explicitly to avoid session delete error.
    tf.keras.backend.clear_session()
def run_ncf(_):
    """Run the NCF train/evaluate loop configured through ``FLAGS``.

    The positional argument is unused. The function prepares the MovieLens
    data, builds a NeuMF Keras model converted to an Estimator, then
    alternates training and evaluation until all cycles are done or the
    hit-rate threshold is passed.
    """
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)
        movielens_dataset.construct_train_eval_csv(
            data_dir=FLAGS.data_dir, dataset=FLAGS.dataset)

    tf.logging.info("Data preprocessing...")
    ncf_dataset = movielens_dataset.data_preprocessing(
        FLAGS.data_dir, FLAGS.dataset, FLAGS.num_neg)

    model_helpers.apply_clean(flags.FLAGS)

    # Create the NeuMF model and convert it to an Estimator.
    tf.logging.info("Creating Estimator from Keras model...")
    layer_sizes = [int(size) for size in FLAGS.layers]
    mlp_reg_values = [float(value) for value in FLAGS.mlp_regularization]
    keras_model = neumf_model.NeuMF(
        ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
        layer_sizes, FLAGS.batch_size, FLAGS.mf_regularization,
        mlp_reg_values)
    num_gpus = flags_core.get_num_gpus(FLAGS)
    estimator = convert_keras_to_estimator(
        keras_model, num_gpus, FLAGS.model_dir)

    # Create hooks that log information about training and metric values.
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
    )

    run_params = {
        "batch_size": FLAGS.batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    bench_log = logger.get_benchmark_logger()
    bench_log.log_run_info(
        model_name="recommendation",
        dataset_name=FLAGS.dataset,
        run_params=run_params,
        test_id=FLAGS.benchmark_test_id)

    def make_train_input_fn():
        """Build the training input fn covering one block of epochs."""
        per_device_batch = distribution_utils.per_device_batch_size(
            FLAGS.batch_size, num_gpus)
        return movielens_dataset.get_input_fn(
            True, per_device_batch, ncf_dataset, FLAGS.data_dir,
            FLAGS.dataset, FLAGS.epochs_between_evals)

    def make_pred_input_fn():
        """Build the prediction input fn for a single pass."""
        per_device_batch = distribution_utils.per_device_batch_size(
            FLAGS.batch_size, num_gpus)
        return movielens_dataset.get_input_fn(
            False, per_device_batch, ncf_dataset, FLAGS.data_dir,
            FLAGS.dataset, 1)

    # Training and evaluation cycle.
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    for cycle_index in range(total_training_cycle):
        iteration = cycle_index + 1
        start_message = "Starting a training cycle: {}/{}".format(
            iteration, total_training_cycle)
        tf.logging.info(start_message)

        # Train the model.
        estimator.train(input_fn=make_train_input_fn(), hooks=train_hooks)

        # Evaluate the model and hand the results to the benchmark logger.
        eval_results = evaluate_model(
            estimator, FLAGS.batch_size, num_gpus, ncf_dataset,
            make_pred_input_fn())
        bench_log.log_evaluation_result(eval_results)

        # Log the HR and NDCG results.
        hit_rate = eval_results[_HR_KEY]
        ndcg = eval_results[_NDCG_KEY]
        result_message = "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            iteration, hit_rate, ndcg)
        tf.logging.info(result_message)

        # Leave the loop once the evaluation threshold is met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
            break

    # Clear the session explicitly to avoid session delete error.
    tf.keras.backend.clear_session()