def construct_estimator(model_dir, params):
  """Construct either an Estimator or TPUEstimator for NCF.

  Args:
    model_dir: The model directory for the estimator
    params: The params dict for the estimator

  Returns:
    An Estimator or TPUEstimator.
  """
  distribution = ncf_common.get_distribution_strategy(params)
  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      eval_distribute=distribution)

  model_fn = neumf_model.neumf_model_fn
  if params["use_xla_for_gpu"]:
    # TODO(seemuch): remove the contrib import
    from tensorflow.contrib.compiler import xla
    tf.logging.info("Using XLA for GPU for training and evaluation.")
    model_fn = xla.estimator_model_fn(model_fn)

  estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir,
                                     config=run_config, params=params)
  return estimator

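# Usage sketch (not part of the original module): construct_estimator only
# builds the estimator; training and eval go through the standard
# tf.estimator API. The model_dir path and the input functions below are
# hypothetical placeholders, not values taken from this file.
#
#   params = ncf_common.parse_flags(FLAGS)
#   estimator = construct_estimator("/tmp/ncf_model", params)
#   estimator.train(input_fn=train_input_fn, steps=num_train_steps)
#   eval_results = estimator.evaluate(input_fn=eval_input_fn,
#                                     steps=num_eval_steps)
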
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    tf.logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  batch_size = params["batch_size"]

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well.
  params["batch_size"] = params["eval_batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
      producer, params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involved,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = ncf_common.get_optimizer(params)
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

    keras_model.compile(
        loss=_keras_loss,
        metrics=[_get_metric_fn(params)],
        optimizer=optimizer)

    history = keras_model.fit(
        train_input_dataset,
        epochs=FLAGS.train_epochs,
        callbacks=[IncrementEpochCallback(producer), time_callback],
        verbose=2)

    tf.logging.info("Training done. Start evaluating")

    eval_results = keras_model.evaluate(
        eval_input_dataset, steps=num_eval_steps, verbose=2)

    tf.logging.info("Keras evaluation is done.")

  stats = build_stats(history, eval_results, time_callback)
  return stats

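# IncrementEpochCallback is defined elsewhere in this file; a minimal sketch
# of the pattern it implements follows. It assumes the producer exposes an
# increment_request_epoch() method (treat the exact method name as an
# assumption). The idea is to advance the data producer's epoch counter so
# fresh training negatives are generated before each Keras epoch begins.
#
#   class IncrementEpochCallback(tf.keras.callbacks.Callback):
#     """Requests a new epoch of data from the producer before each epoch."""
#
#     def __init__(self, producer):
#       self._producer = producer
#
#     def on_epoch_begin(self, epoch, logs=None):
#       self._producer.increment_request_epoch()
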
def construct_estimator(model_dir, params):
  """Construct either an Estimator or TPUEstimator for NCF.

  Args:
    model_dir: The model directory for the estimator
    params: The params dict for the estimator

  Returns:
    An Estimator or TPUEstimator.
  """
  distribution = ncf_common.get_distribution_strategy(params)
  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      eval_distribute=distribution)

  model_fn = neumf_model.neumf_model_fn
  if params["use_xla_for_gpu"]:
    # xla is not imported at module level in this excerpt; import it here so
    # the wrapping below resolves.
    from tensorflow.contrib.compiler import xla
    tf.logging.info("Using XLA for GPU for training and evaluation.")
    model_fn = xla.estimator_model_fn(model_fn)

  estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir,
                                     config=run_config, params=params)
  return estimator

def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)

  if params["keras_use_ctl"] and int(tf.__version__.split(".")[0]) == 1:
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and above.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well. This is the
  # per device batch size.
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
      producer, params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involved,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  per_epoch_callback = IncrementEpochCallback(producer)
  callbacks = [per_epoch_callback, time_callback]

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_metric_fn", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

  if params["keras_use_ctl"]:
    loss_object = tf.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM,
        from_logits=True)
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

    @tf.function
    def train_step():
      """Called once per step to train the model."""
      def step_fn(inputs):
        """Computes loss and applied gradient per replica."""
        features, labels = inputs
        with tf.GradientTape() as tape:
          softmax_logits = keras_model(features)
          loss = loss_object(
              labels, softmax_logits,
              sample_weight=features[rconst.VALID_POINT_MASK])
          loss *= (1.0 / (batch_size * strategy.num_replicas_in_sync))

        grads = tape.gradient(loss, keras_model.trainable_variables)
        optimizer.apply_gradients(
            list(zip(grads, keras_model.trainable_variables)))
        return loss

      per_replica_losses = strategy.experimental_run(
          step_fn, train_input_iterator)
      mean_loss = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""
      def step_fn(inputs):
        """Computes eval metrics per replica."""
        features, _ = inputs
        softmax_logits = keras_model(features)
        in_top_k, metric_weights = metric_fn(
            softmax_logits, features[rconst.DUPLICATE_MASK], params)
        hr_sum = tf.reduce_sum(in_top_k * metric_weights)
        hr_count = tf.reduce_sum(metric_weights)
        return hr_sum, hr_count

      per_replica_hr_sum, per_replica_hr_count = (
          strategy.experimental_run(step_fn, eval_input_iterator))
      hr_sum = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
      hr_count = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      per_epoch_callback.on_epoch_begin(epoch)
      train_input_iterator.initialize()
      train_loss = 0
      for step in range(num_train_steps):
        time_callback.on_batch_begin(step + epoch * num_train_steps)
        train_loss += train_step()
        time_callback.on_batch_end(step + epoch * num_train_steps)
      train_loss /= num_train_steps
      logging.info("Done training epoch %s, epoch loss=%s.",
                   epoch + 1, train_loss)

      eval_input_iterator.initialize()
      hr_sum = 0
      hr_count = 0
      for _ in range(num_eval_steps):
        step_hr_sum, step_hr_count = eval_step()
        hr_sum += step_hr_sum
        hr_count += step_hr_count
      logging.info("Done eval epoch %s, hr=%s.", epoch + 1, hr_sum / hr_count)

      if (FLAGS.early_stopping and
          float(hr_sum / hr_count) > params["hr_threshold"]):
        break

    time_callback.on_train_end()
    eval_results = [None, hr_sum / hr_count]

  else:
    with distribution_utils.get_strategy_scope(strategy):
      keras_model.compile(optimizer=optimizer)

      history = keras_model.fit(
          train_input_dataset,
          steps_per_epoch=num_train_steps,
          epochs=FLAGS.train_epochs,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_results = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

    # Guard against a missing fit history so train_loss is always defined
    # before it is passed to build_stats.
    train_loss = None
    if history and history.history:
      train_history = history.history
      train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats

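# Why the custom training loop above scales the loss by
# 1 / (batch_size * num_replicas_in_sync): loss_object uses Reduction.SUM, so
# each replica returns a sum over its per-replica batch, and
# strategy.reduce(SUM, ...) then sums across replicas. Dividing by the global
# batch size turns that double sum into a mean over the global batch. A
# standalone arithmetic check, with hypothetical numbers:
#
#   batch_size = 8                    # per-replica batch size
#   num_replicas = 2                  # e.g. two GPUs
#   per_replica_sums = [12.0, 20.0]   # SUM-reduced loss on each replica
#   scaled = [s / (batch_size * num_replicas) for s in per_replica_sums]
#   global_mean = sum(scaled)         # == 32.0 / 16 == 2.0
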
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  batch_size = params["batch_size"]

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well.
  params["batch_size"] = params["eval_batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
      producer, params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involved,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

    keras_model.compile(
        loss=_keras_loss,
        metrics=[_get_metric_fn(params)],
        optimizer=optimizer)

    history = keras_model.fit(
        train_input_dataset,
        epochs=FLAGS.train_epochs,
        callbacks=[IncrementEpochCallback(producer), time_callback],
        verbose=2)

    logging.info("Training done. Start evaluating")

    eval_results = keras_model.evaluate(
        eval_input_dataset, steps=num_eval_steps, verbose=2)

    logging.info("Keras evaluation is done.")

  stats = build_stats(history, eval_results, time_callback)
  return stats

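# A minimal entry-point sketch (an assumption, not part of this excerpt):
# the surrounding module is expected to define FLAGS = flags.FLAGS and
# register the NCF flags before absl dispatches to run_ncf. The
# flag-definition helper named below is hypothetical here.
#
#   from absl import app as absl_app
#   from absl import flags
#
#   def main(_):
#     run_ncf(FLAGS)
#
#   if __name__ == "__main__":
#     ncf_common.define_ncf_flags()  # assumed flag-definition helper
#     absl_app.run(main)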