def get_inputs(params):
  """Returns dataset metadata, step counts, and the data producer."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer

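# A minimal usage sketch for get_inputs(), assuming the producer interface
# used elsewhere in this file (start / make_input_fn / stop_loop / join) and
# a params dict that already carries "batches_per_step". The function name
# run_one_epoch and the elided training step are hypothetical.
def run_one_epoch(params):
  num_users, num_items, num_train_steps, _, producer = get_inputs(params)
  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  try:
    input_fn = producer.make_input_fn(is_training=True)
    dataset = input_fn(params)  # tf.data.Dataset of (features, labels)
    # ... run num_train_steps training steps over `dataset` ...
  finally:
    producer.stop_loop()
    producer.join()
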
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.prebatch_size,
      "eval_batch_size": flag_obj.prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "prebatch_size": flag_obj.prebatch_size,
  }
  # pylint: enable=protected-access

  return producer, input_metadata

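# A hedged sketch of the offline generation path built on prepare_raw_data().
# It assumes the producer honors the start()/join() protocol used elsewhere
# in this file and that generate_data_offline=True makes it write shards to
# epoch_dir; the metadata filename and helper name are assumptions, not an
# established convention.
import json
import os

def generate_data_and_metadata(flag_obj):
  producer, input_metadata = prepare_raw_data(flag_obj)
  producer.start()
  producer.join()  # block until all epochs of shards are written
  metadata_path = os.path.join(flag_obj.data_dir, "input_metadata.json")
  with tf.gfile.Open(metadata_path, "w") as f:
    json.dump(input_metadata, f, indent=2)
  return metadata_path
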
def test_end_to_end(self):
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir,
      batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE,
      num_data_readers=2, num_neg=NUM_NEG)

  for _ in range(30):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written

  g = tf.Graph()
  with g.as_default():
    input_fn, record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset)
    dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})

  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
  item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

  train_examples = {
      True: set(),
      False: set(),
  }
  for features, labels in first_epoch:
    for u, i, l in zip(features[movielens.USER_COLUMN],
                       features[movielens.ITEM_COLUMN], labels):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        assert i_raw == self.holdout[u_raw][1]
      train_examples[l].add((u_raw, i_raw))

  num_positives_seen = len(train_examples[True])
  # The numbers don't match exactly because the last batch spills over into
  # the next epoch.
  assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9

def test_end_to_end(self):
  ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir,
      batch_size=BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE,
      num_cycles=1, num_data_readers=2, num_neg=NUM_NEG)

  g = tf.Graph()
  with g.as_default():
    input_fn, record_dir, batch_count = \
        data_preprocessing.make_input_fn(ncf_dataset, True)
    dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
                        "use_xla_for_gpu": False})

  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
  item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

  train_examples = {
      True: set(),
      False: set(),
  }
  for features, labels in first_epoch:
    for u, i, l in zip(features[movielens.USER_COLUMN],
                       features[movielens.ITEM_COLUMN], labels):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        assert i_raw == self.holdout[u_raw][1]
      train_examples[l].add((u_raw, i_raw))

  num_positives_seen = len(train_examples[True])
  assert ncf_dataset.num_train_positives == num_positives_seen

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9

def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  master = tpu_cluster_resolver.master()

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      # TODO(shizhiw): support multihost.
      batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      num_neg=FLAGS.num_neg,
      num_cycles=_NUM_EPOCHS,
      epochs_per_cycle=1,
      match_mlperf=FLAGS.ml_perf,
      use_subprocess=FLAGS.use_subprocess,
      cache_id=FLAGS.cache_id)

  train_params, eval_params = create_params(ncf_dataset)

  eval_graph_spec = build_graph(eval_params, ncf_dataset,
                                tpu_embedding.INFERENCE)

  for epoch in range(_NUM_EPOCHS):
    tf.logging.info("Training {}...".format(epoch))
    # Rebuild the training graph each epoch because the number of batches per
    # epoch (i.e. batch_count) might change by 1 between epochs.
    train_graph_spec = build_graph(train_params, ncf_dataset,
                                   tpu_embedding.TRAINING)
    run_graph(master, train_graph_spec, epoch)

    tf.logging.info("Evaluating {}...".format(epoch))
    run_graph(master, eval_graph_spec, epoch)

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

def _test_fresh_randomness(self, constructor_type):
  train_epochs = 5
  params = self.make_params(train_epochs=train_epochs)
  _, _, producer = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir, params=params,
      constructor_type=constructor_type, deterministic=True)

  producer.start()

  results = []
  g = tf.Graph()
  with g.as_default():
    for _ in range(train_epochs):
      input_fn = producer.make_input_fn(is_training=True)
      dataset = input_fn(params)
      results.extend(self.drain_dataset(dataset=dataset, g=g))

  producer.join()
  assert producer._fatal_exception is None

  positive_counts, negative_counts = defaultdict(int), defaultdict(int)
  md5 = hashlib.md5()
  for features, labels in results:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.VALID_POINT_MASK], labels]
    for i in data_list:
      md5.update(i.tobytes())

    for u, i, v, l in zip(*data_list):
      if not v:
        continue  # ignore padding

      if l:
        positive_counts[(u, i)] += 1
      else:
        negative_counts[(u, i)] += 1

  self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

  # The positive examples should appear exactly once each epoch.
  self.assertAllEqual(list(positive_counts.values()),
                      [train_epochs for _ in positive_counts])

  # The threshold for the negatives is heuristic: repeats are expected in
  # general, but they should not appear too frequently.
  pair_cardinality = NUM_USERS * NUM_ITEMS
  neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

  # Approximation of the expected number of times that a particular negative
  # will appear in a given epoch. Implicit in this calculation is the
  # treatment of all negative pairs as equally likely. In general this is not
  # necessarily reasonable; however the generation in self.setUp() will
  # approximate this behavior sufficiently for heuristic testing.
  e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

  # The frequency of occurrence of a given negative pair should follow an
  # approximately binomial distribution in the limit that the cardinality of
  # the negative pair set >> number of samples per epoch.
  approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                     n=train_epochs, p=e_sample)

  # Tally the actual observed counts.
  count_distribution = [0 for _ in range(train_epochs + 1)]
  for i in negative_counts.values():
    i = min([i, train_epochs])  # round down tail for simplicity.
    count_distribution[i] += 1
  count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])

  # Check that the frequency of negative pairs is approximately binomial.
  for i in range(train_epochs + 1):
    if approx_pdf[i] < 0.05:
      continue  # Variance will be high at the tails.

    observed_fraction = count_distribution[i] / neg_pair_cardinality
    deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                 (observed_fraction + approx_pdf[i]))

    self.assertLess(deviation, 0.2)

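# The binomial heuristic above with concrete toy numbers (the real values
# come from the test fixtures; these are stand-ins for illustration):
import numpy as np
import scipy.stats

num_users, num_items, num_pos, num_neg, epochs = 100, 500, 2000, 4, 5
neg_pair_cardinality = num_users * num_items - num_pos  # 48000 candidate pairs
e_sample = num_pos * num_neg / neg_pair_cardinality     # ~0.167 draws/epoch
approx_pdf = scipy.stats.binom.pmf(
    k=np.arange(epochs + 1), n=epochs, p=e_sample)
# approx_pdf ~ [0.40, 0.40, 0.16, 0.03, ...]: most negative pairs are never
# drawn across 5 epochs, and three or more repeats should be rare.
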
def _test_end_to_end(self, constructor_type):
  params = self.make_params(train_epochs=1)
  _, _, producer = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir, params=params,
      constructor_type=constructor_type, deterministic=True)
  producer.start()
  producer.join()
  assert producer._fatal_exception is None

  user_inv_map = {v: k for k, v in producer.user_map.items()}
  item_inv_map = {v: k for k, v in producer.item_map.items()}

  # ==========================================================================
  # == Training Data =========================================================
  # ==========================================================================
  g = tf.Graph()
  with g.as_default():
    input_fn = producer.make_input_fn(is_training=True)
    dataset = input_fn(params)

  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  counts = defaultdict(int)
  train_examples = {
      True: set(),
      False: set(),
  }

  md5 = hashlib.md5()
  for features, labels in first_epoch:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.VALID_POINT_MASK], labels]
    for i in data_list:
      md5.update(i.tobytes())

    for u, i, v, l in zip(*data_list):
      if not v:
        continue  # ignore padding

      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        self.assertEqual(i_raw, self.holdout[u_raw][1])
      train_examples[l].add((u_raw, i_raw))
      counts[(u_raw, i_raw)] += 1

  self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)

  num_positives_seen = len(train_examples[True])
  self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  self.assertGreater(
      len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)

  # This checks that the samples produced are independent by checking the
  # number of duplicate entries. If workers are not properly independent there
  # will be lots of repeated pairs.
  self.assertLess(np.mean(list(counts.values())), 1.1)

  # ==========================================================================
  # == Eval Data =============================================================
  # ==========================================================================
  with g.as_default():
    input_fn = producer.make_input_fn(is_training=False)
    dataset = input_fn(params)

  eval_data = self.drain_dataset(dataset=dataset, g=g)

  current_user = None
  md5 = hashlib.md5()
  for features in eval_data:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.DUPLICATE_MASK]]
    for i in data_list:
      md5.update(i.tobytes())

    for idx, (u, i, d) in enumerate(zip(*data_list)):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if current_user is None:
        current_user = u

      # Ensure that users appear in blocks, as the evaluation logic expects
      # this structure.
      self.assertEqual(u, current_user)

      # The structure of evaluation data is 999 negative examples followed
      # by the holdout positive.
      if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
        # Check that the last element in each chunk is the holdout item.
        self.assertEqual(i_raw, self.holdout[u_raw][1])
        current_user = None

      elif i_raw == self.holdout[u_raw][1]:
        # Because the holdout item is not given to the negative generation
        # process, it can appear as a negative. In that case, it should be
        # masked out as a duplicate. (Since the true positive is placed at
        # the end and would therefore lose the tie.)
        assert d

      else:
        # Otherwise check that the other 999 points for a user are selected
        # from the negatives.
        assert (u_raw, i_raw) not in self.seen_pairs

  self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)

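# The eval layout verified above, made concrete: each user owns a contiguous
# block of rconst.NUM_EVAL_NEGATIVES + 1 rows (999 negatives, then the
# holdout positive). A small stand-alone helper expressing the same
# invariant; the name and interface are illustrative:
import numpy as np

def holdout_positions(num_rows, negatives_per_user=999):
  """Row indices where the holdout positive must sit: 999, 1999, 2999, ..."""
  block = negatives_per_user + 1
  assert num_rows % block == 0, "eval data must be whole user blocks"
  return np.arange(block - 1, num_rows, block)

# holdout_positions(3000) -> array([ 999, 1999, 2999])
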
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }

  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)

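# The eval_batch_size rounding above, in isolation: the batch size is floored
# to a multiple of eval_per_user so no user's block of eval rows straddles a
# batch boundary. With rconst.NUM_EVAL_NEGATIVES = 999, as the tests in this
# file assume:
eval_per_user = 999 + 1                 # 1000 eval rows per user
eval_batch_size = 2048                  # requested
eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
assert eval_batch_size == 2000          # floored: 2 whole users per batch
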
def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  params = create_params()

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        epoch_dir=os.path.join(params["model_dir"], "epoch"),
        params=get_params_for_dataset(params),
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  producer.start()

  params["num_users"] = num_users
  params["num_items"] = num_items

  feature_columns = create_feature_columns(params)
  model_fn = create_model_fn(feature_columns)
  estimator = create_tpu_estimator(model_fn, feature_columns, params)

  train_hooks = hooks_helper.get_train_hooks(
      ["ProfilerHook"],
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})

  for cycle_index in range(FLAGS.train_epochs):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, FLAGS.train_epochs))
    train_input_fn = producer.make_input_fn(is_training=True)
    estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                    steps=num_train_steps)

    tf.logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

  producer.stop_loop()
  producer.join()

def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf,
      deterministic=FLAGS.seed is not None)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})

  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    eval_results = eval_estimator.evaluate(pred_input_fn)
    tf.logging.info("Evaluation complete.")

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[rconst.HR_KEY]
    ndcg = eval_results[rconst.NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

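# The approx_train_steps estimate above, concretely: an epoch holds
# num_train_positives * (1 + num_neg) examples, since each positive is paired
# with num_neg sampled negatives. Toy numbers, illustrative only:
num_train_positives, num_neg, batch_size = 994169, 4, 2048
approx_train_steps = num_train_positives * (1 + num_neg) // batch_size  # 2427
# The async pipeline may report one batch more or fewer, which is why the
# loop above only warns when the difference exceeds one.
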
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  params = parse_flags(FLAGS)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  producer.start()
  params["num_users"], params["num_items"] = num_users, num_items
  model_helpers.apply_clean(flags.FLAGS)

  estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

  benchmark_logger, train_hooks = log_and_get_hooks(params["eval_batch_size"])

  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    train_input_fn = producer.make_input_fn(is_training=True)
    estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                    steps=num_train_steps)

    tf.logging.info("Beginning evaluation.")
    eval_input_fn = producer.make_input_fn(is_training=False)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
    tf.logging.info("Evaluation complete.")

    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    loss = float(eval_results["loss"])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_index + 1, hr, ndcg, loss))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  producer.stop_loop()
  producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)

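# The divisibility asserts above, in isolation: an epoch's batch count must
# be an exact multiple of params["batches_per_step"] (presumably the number
# of batches consumed per estimator step). Toy numbers, illustrative only:
train_batches_per_epoch = 8000
batches_per_step = 8
assert train_batches_per_epoch % batches_per_step == 0
num_train_steps = train_batches_per_epoch // batches_per_step  # 1000 steps
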
def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = (1 if FLAGS.inference_only else
                          FLAGS.train_epochs // FLAGS.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    if not FLAGS.inference_only:
      # Train the model
      train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

      if np.abs(approx_train_steps - batch_count) > 1:
        tf.logging.warning(
            "Estimated ({}) and reported ({}) number of batches differ by "
            "more than one".format(approx_train_steps, batch_count))
      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=batch_count)
      tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # Export SavedModel
    if FLAGS.export_savedmodel:
      eval_estimator.export_savedmodel(FLAGS.model_dir,
                                       serving_input_receiver_fn)
      print("SavedModel successfully exported to: {}/<timestamp>".format(
          FLAGS.model_dir))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)

  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max([FLAGS.batch_size, eval_per_user]))
  if eval_batch_size % eval_per_user:
    eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    num_train_steps = int(
        np.ceil(FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
                (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(
        np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "use_seed": FLAGS.seed is not None,
          "hash_pipeline": FLAGS.hash_pipeline,
          "batch_size": batch_size,
          "eval_batch_size": eval_batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": num_users,
          "num_items": num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "num_neg": FLAGS.num_neg,
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
          "beta1": FLAGS.beta1,
          "beta2": FLAGS.beta2,
          "epsilon": FLAGS.epsilon,
          "match_mlperf": FLAGS.ml_perf,
          "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"})

  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(model_name="recommendation",
                                dataset_name=FLAGS.dataset,
                                run_params=run_params,
                                test_id=FLAGS.benchmark_test_id)

  pred_input_fn = None
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

    if batch_count != num_train_steps:
      raise ValueError(
          "Step counts do not match. ({} vs. {}) The async process is "
          "producing incorrect shards.".format(batch_count, num_train_steps))

    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=num_train_steps)
    if train_record_dir:
      tf.gfile.DeleteRecursively(train_record_dir)

    tf.logging.info("Beginning evaluation.")
    if pred_input_fn is None:
      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
          ncf_dataset=ncf_dataset, is_training=False)

      if eval_batch_count != num_eval_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(
                eval_batch_count, num_eval_steps))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                            value=cycle_index)
    eval_results = eval_estimator.evaluate(pred_input_fn,
                                           steps=num_eval_steps)
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])
    tf.logging.info("Evaluation complete.")

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                            value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)

def run_ncf(_):
  """Run NCF training and eval loop."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
  ncf_dataset = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
      batch_size=batch_size,
      eval_batch_size=eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=FLAGS.epochs_between_evals,
      match_mlperf=FLAGS.ml_perf)

  model_helpers.apply_clean(flags.FLAGS)

  train_estimator, eval_estimator = construct_estimator(
      num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
          "batch_size": batch_size,
          "learning_rate": FLAGS.learning_rate,
          "num_users": ncf_dataset.num_users,
          "num_items": ncf_dataset.num_items,
          "mf_dim": FLAGS.num_factors,
          "model_layers": [int(layer) for layer in FLAGS.layers],
          "mf_regularization": FLAGS.mf_regularization,
          "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
          "use_tpu": FLAGS.tpu is not None,
          "tpu": FLAGS.tpu,
          "tpu_zone": FLAGS.tpu_zone,
          "tpu_gcp_project": FLAGS.tpu_gcp_project,
      }, batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)

  approx_train_steps = int(ncf_dataset.num_train_positives
                           * (1 + FLAGS.num_neg) // FLAGS.batch_size)
  pred_input_fn = data_preprocessing.make_pred_input_fn(
      ncf_dataset=ncf_dataset)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)

    if np.abs(approx_train_steps - batch_count) > 1:
      tf.logging.warning(
          "Estimated ({}) and reported ({}) number of batches differ by more "
          "than one".format(approx_train_steps, batch_count))
    train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                          steps=batch_count)
    tf.gfile.DeleteRecursively(train_record_dir)

    # Evaluate the model
    eval_results = evaluate_model(eval_estimator, ncf_dataset, pred_input_fn)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # Some of the NumPy vector math can be quite large and likes to stay in
    # memory for a while.
    gc.collect()

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()