Exemplo n.º 1
0
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_cycles, num_data_readers=None, num_neg=4,
                         epochs_per_cycle=1, match_mlperf=False,
                         deterministic=False, use_subprocess=True,
                         cache_id=None):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Build the data cache and hand off to the asynchronous generation process.

  The cache is produced by `construct_cache`. When `use_subprocess` is True a
  generation subprocess is launched with GPUs hidden from it. The function then
  polls (once per second, up to 300 times) for the `subproc_alive` marker file,
  clears the system caches, and finally writes the flagfile that the generation
  process is waiting for.

  Args:
    dataset: Forwarded to `construct_cache`.
    data_dir: Directory that is created if necessary and passed to
      `construct_cache`, the subprocess command line and the flagfile.
    batch_size: Written to the flagfile as "train_batch_size".
    eval_batch_size: Written to the flagfile as "eval_batch_size".
    num_cycles: Written to the flagfile as "num_cycles".
    num_data_readers: Forwarded to `construct_cache`.
    num_neg: Written to the flagfile as "num_neg".
    epochs_per_cycle: Written to the flagfile as "epochs_per_cycle".
    match_mlperf: Forwarded to `construct_cache` and written to the flagfile
      as "ml_perf".
    deterministic: Forwarded to `construct_cache`.
    use_subprocess: If True, start the generation subprocess here. The value
      also sets the "redirect_logs" flag, and its negation sets
      "use_tf_logging".
    cache_id: Forwarded to `construct_cache`.

  Returns:
    The dataset object returned by `construct_cache`, and a `cleanup` callable.
    `cleanup` is also registered with `atexit`; it shuts down the subprocess
    (if one was started) and deletes the cache root. Calls after the first
    successful one are no-ops.

  Raises:
    ValueError: If the `subproc_alive` marker never appears.
  """
  tf.logging.info("Beginning data preprocessing.")
  tf.gfile.MakeDirs(data_dir)
  ncf_dataset = construct_cache(
      dataset=dataset,
      data_dir=data_dir,
      num_data_readers=num_data_readers,
      match_mlperf=match_mlperf,
      deterministic=deterministic,
      cache_id=cache_id)
  cache_paths = ncf_dataset.cache_paths

  # Use only ~75% of the cores (but at least one) so that the worker pool
  # behind training data generation cannot starve other processes.
  worker_count = max(int(multiprocessing.cpu_count() * 0.75), 1)

  generation_flags = {
      "data_dir": data_dir,
      "cache_id": cache_paths.cache_id,
      "num_neg": num_neg,
      "num_train_positives": ncf_dataset.num_train_positives,
      "num_items": ncf_dataset.num_items,
      "num_users": ncf_dataset.num_users,
      "num_readers": ncf_dataset.num_data_readers,
      "epochs_per_cycle": epochs_per_cycle,
      "num_cycles": num_cycles,
      "train_batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "num_workers": worker_count,
      "redirect_logs": use_subprocess,
      "use_tf_logging": not use_subprocess,
      "ml_perf": match_mlperf,
      "output_ml_perf_compliance_logging": mlperf_helper.LOGGER.enabled,
  }

  if use_subprocess:
    tf.logging.info("Creating training file subprocess.")
    # The child only needs TensorFlow for tf.gfile. Left alone it would try to
    # grab GPU memory and compete with the main training process, so every
    # GPU is hidden from it.
    child_env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
    child_command = popen_helper.INVOCATION + [
        "--data_dir", data_dir,
        "--cache_id", str(cache_paths.cache_id),
    ]
    tf.logging.info(
        "Generation subprocess command: {}".format(" ".join(child_command)))
    proc = subprocess.Popen(args=child_command, shell=False, env=child_env)

  # A one-element dict serves as a mutable flag shared with the closure.
  cleanup_called = {"finished": False}

  def cleanup():
    """Remove files and subprocess from data generation."""
    if not cleanup_called["finished"]:
      if use_subprocess:
        _shutdown(proc)

      try:
        tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
      except tf.errors.NotFoundError:
        pass  # Nothing left to delete.

      cleanup_called["finished"] = True

  atexit.register(cleanup)

  # Give the generation process up to 300 one-second naps to write its `alive`
  # marker. NOTE(review): this wait also runs when use_subprocess is False;
  # presumably something else writes the marker in that case -- confirm.
  alive_marker = cache_paths.subproc_alive
  naps_left = 300
  while naps_left and not tf.gfile.Exists(alive_marker):
    time.sleep(1)  # allow `alive` file to be written
    naps_left -= 1
  if not tf.gfile.Exists(alive_marker):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  # The async process has now signaled that it is alive and is looping until
  # the flagfile is written. Clear the system caches, mark the start of the
  # run, and only then release it by writing the flagfile.
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_CLEAR_CACHES)
  mlperf_helper.clear_system_caches()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_START)
  write_flagfile(generation_flags, ncf_dataset)

  return ncf_dataset, cleanup
Exemplo n.º 2
0
def _filter_index_sort(raw_rating_path, match_mlperf):
  # type: (str, bool) -> (pd.DataFrame, dict, dict)
  """Load the raw ratings CSV and return it filtered, re-indexed and sorted.

  Three preprocessing transformations are applied to the CSV of positive
  items, in this order:

  1)  Drop every user with fewer than `rconst.MIN_NUM_RATINGS` ratings.
      (Typically 20 items.)

  2)  Replace user and item IDs with zero-based indices, assigned in order of
      first appearance, such that the largest user_id is `num_users - 1` and
      the largest item_id is `num_items - 1`.

  3)  Sort the dataframe by user_id, with timestamp as a secondary sort key.
      This allows the dataframe to be sliced by user in-place, and the last
      item of a user to be selected with the `-1` index of that user's slice.

  All of this is done by Pandas and is therefore single-threaded, but it only
  takes ~2 minutes, and a MapReduce-style parallel version would add
  significant complexity for no computational gain. For a larger dataset,
  parallelizing this preprocessing could yield speedups. (Also, this
  preprocessing step is only performed once for an entire run.)

  Args:
    raw_rating_path: The path to the CSV which contains the raw dataset.
    match_mlperf: If True, change the sorting algorithm to match the MLPerf
      reference implementation.

  Returns:
    A filtered, zero-index remapped, sorted dataframe (the pre-sort index is
    kept as a column because `reset_index` is called without `drop`), a dict
    mapping raw user IDs to regularized user IDs, and a dict mapping raw item
    IDs to regularized item IDs.
  """
  with tf.gfile.Open(raw_rating_path) as csv_file:
    ratings = pd.read_csv(csv_file)

  # Keep only the users who have at least MIN_NUM_RATINGS ratings.
  ratings = ratings.groupby(movielens.USER_COLUMN).filter(
      lambda rows: len(rows) >= rconst.MIN_NUM_RATINGS)  # type: pd.DataFrame

  original_users = ratings[movielens.USER_COLUMN].unique()
  original_items = ratings[movielens.ITEM_COLUMN].unique()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_MIN_RATINGS,
                          value=rconst.MIN_NUM_RATINGS)

  # Assign each raw user / item id a dense, zero-based index.
  tf.logging.info("Generating user_map and item_map...")
  user_map = dict(zip(original_users, range(len(original_users))))
  item_map = dict(zip(original_items, range(len(original_items))))

  ratings[movielens.USER_COLUMN] = ratings[movielens.USER_COLUMN].apply(
      lambda raw_user: user_map[raw_user])
  ratings[movielens.ITEM_COLUMN] = ratings[movielens.ITEM_COLUMN].apply(
      lambda raw_item: item_map[raw_item])

  num_users = len(original_users)
  num_items = len(original_items)

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                          value=rconst.NUM_EVAL_NEGATIVES)
  mlperf_helper.ncf_print(
      key=mlperf_helper.TAGS.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
      value=match_mlperf)

  # Sanity checks: ids must fit int32 (users) / uint16 (items), and the
  # remapping must be dense.
  assert num_users <= np.iinfo(np.int32).max
  assert num_items <= np.iinfo(np.uint16).max
  assert ratings[movielens.USER_COLUMN].max() == num_users - 1
  assert ratings[movielens.ITEM_COLUMN].max() == num_items - 1

  # The sort shards the dataframe by user, and later lets the last item of
  # each user be picked for validation.
  tf.logging.info("Sorting by user, timestamp...")

  user_then_time = [movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN]
  if match_mlperf:
    # Equivalent to the non-MLPerf sort, except that rows sharing the same
    # user and timestamp may end up in a different order. For some reason this
    # ordering gives a better hit-rate during evaluation, matching the
    # performance of the MLPerf reference implementation.
    ratings.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
    ratings.sort_values(user_then_time, inplace=True, kind="mergesort")
  else:
    ratings.sort_values(user_then_time, inplace=True)

  # Neither the filter nor the sort rebuilds the index, so do it here.
  ratings = ratings.reset_index()

  return ratings, user_map, item_map
def instantiate_pipeline(dataset,
                         data_dir,
                         batch_size,
                         eval_batch_size,
                         num_data_readers=None,
                         num_neg=4,
                         epochs_per_cycle=1,
                         match_mlperf=False,
                         deterministic=False,
                         use_subprocess=True,
                         cache_id=None):
    # type: (...) -> (NCFDataset, typing.Callable)
    """Build the data cache and hand off to the asynchronous generation process.

    The cache is produced by `construct_cache`. When `use_subprocess` is True
    a generation subprocess is launched with GPUs hidden from it. The function
    then polls (once per second, up to 300 times) for the `subproc_alive`
    marker file, clears the system caches, and finally writes the flagfile
    that the generation process is waiting for.

    Args:
      dataset: Forwarded to `construct_cache`.
      data_dir: Directory that is created if necessary and passed to
        `construct_cache`, the subprocess command line and the flagfile.
      batch_size: Written to the flagfile as "train_batch_size".
      eval_batch_size: Written to the flagfile as "eval_batch_size".
      num_data_readers: Forwarded to `construct_cache`.
      num_neg: Written to the flagfile as "num_neg".
      epochs_per_cycle: Written to the flagfile as "epochs_per_cycle".
      match_mlperf: Forwarded to `construct_cache` and written to the flagfile
        as "ml_perf".
      deterministic: Forwarded to `construct_cache`.
      use_subprocess: If True, start the generation subprocess here. The value
        also sets the "redirect_logs" flag, and its negation sets
        "use_tf_logging".
      cache_id: Forwarded to `construct_cache`.

    Returns:
      The dataset object returned by `construct_cache`, and a `cleanup`
      callable. `cleanup` is also registered with `atexit`; it shuts down the
      subprocess (if one was started) and deletes the cache root. Calls after
      the first successful one are no-ops.

    Raises:
      ValueError: If the `subproc_alive` marker never appears.
    """
    tf.logging.info("Beginning data preprocessing.")
    tf.gfile.MakeDirs(data_dir)
    ncf_dataset = construct_cache(
        dataset=dataset,
        data_dir=data_dir,
        num_data_readers=num_data_readers,
        match_mlperf=match_mlperf,
        deterministic=deterministic,
        cache_id=cache_id)
    cache_paths = ncf_dataset.cache_paths

    # Use only ~75% of the cores (but at least one) so that the worker pool
    # behind training data generation cannot starve other processes.
    worker_count = max(int(multiprocessing.cpu_count() * 0.75), 1)

    generation_flags = {
        "data_dir": data_dir,
        "cache_id": cache_paths.cache_id,
        "num_neg": num_neg,
        "num_train_positives": ncf_dataset.num_train_positives,
        "num_items": ncf_dataset.num_items,
        "num_users": ncf_dataset.num_users,
        "num_readers": ncf_dataset.num_data_readers,
        "epochs_per_cycle": epochs_per_cycle,
        "train_batch_size": batch_size,
        "eval_batch_size": eval_batch_size,
        "num_workers": worker_count,
        "redirect_logs": use_subprocess,
        "use_tf_logging": not use_subprocess,
        "ml_perf": match_mlperf,
        "output_ml_perf_compliance_logging": mlperf_helper.LOGGER.enabled,
    }

    if use_subprocess:
        tf.logging.info("Creating training file subprocess.")
        # The child only needs TensorFlow for tf.gfile. Left alone it would
        # try to grab GPU memory and compete with the main training process,
        # so every GPU is hidden from it.
        child_env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
        child_command = popen_helper.INVOCATION + [
            "--data_dir",
            data_dir,
            "--cache_id",
            str(cache_paths.cache_id),
        ]
        tf.logging.info("Generation subprocess command: {}".format(
            " ".join(child_command)))
        proc = subprocess.Popen(args=child_command,
                                shell=False,
                                env=child_env)

    # A one-element dict serves as a mutable flag shared with the closure.
    cleanup_called = {"finished": False}

    def cleanup():
        """Remove files and subprocess from data generation."""
        if not cleanup_called["finished"]:
            if use_subprocess:
                _shutdown(proc)

            try:
                tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
            except tf.errors.NotFoundError:
                pass  # Nothing left to delete.

            cleanup_called["finished"] = True

    atexit.register(cleanup)

    # Give the generation process up to 300 one-second naps to write its
    # `alive` marker. NOTE(review): this wait also runs when use_subprocess is
    # False; presumably something else writes the marker then -- confirm.
    alive_marker = cache_paths.subproc_alive
    naps_left = 300
    while naps_left and not tf.gfile.Exists(alive_marker):
        time.sleep(1)  # allow `alive` file to be written
        naps_left -= 1
    if not tf.gfile.Exists(alive_marker):
        raise ValueError(
            "Generation subprocess did not start correctly. Data will "
            "not be available; exiting to avoid waiting forever.")

    # The async process has now signaled that it is alive and is looping until
    # the flagfile is written. Clear the system caches, mark the start of the
    # run, and only then release it by writing the flagfile.
    mlperf_helper.clear_system_caches()
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_START)
    write_flagfile(generation_flags, ncf_dataset)

    return ncf_dataset, cleanup
def _filter_index_sort(raw_rating_path, match_mlperf):
    # type: (str, bool) -> (pd.DataFrame, dict, dict)
    """Load the raw ratings CSV and return it filtered, re-indexed and sorted.

    Three preprocessing transformations are applied to the CSV of positive
    items, in this order:

    1)  Drop every user with fewer than `rconst.MIN_NUM_RATINGS` ratings.
        (Typically 20 items.)

    2)  Replace user and item IDs with zero-based indices, assigned in order
        of first appearance, such that the largest user_id is `num_users - 1`
        and the largest item_id is `num_items - 1`.

    3)  Sort the dataframe by user_id, with timestamp as a secondary sort key.
        This allows the dataframe to be sliced by user in-place, and the last
        item of a user to be selected with the `-1` index of that user's
        slice.

    All of this is done by Pandas and is therefore single-threaded, but it
    only takes ~2 minutes, and a MapReduce-style parallel version would add
    significant complexity for no computational gain. For a larger dataset,
    parallelizing this preprocessing could yield speedups. (Also, this
    preprocessing step is only performed once for an entire run.)

    Args:
      raw_rating_path: The path to the CSV which contains the raw dataset.
      match_mlperf: If True, change the sorting algorithm to match the MLPerf
        reference implementation.

    Returns:
      A filtered, zero-index remapped, sorted dataframe (the pre-sort index is
      kept as a column because `reset_index` is called without `drop`), a dict
      mapping raw user IDs to regularized user IDs, and a dict mapping raw
      item IDs to regularized item IDs.
    """
    with tf.gfile.Open(raw_rating_path) as csv_file:
        ratings = pd.read_csv(csv_file)

    # Keep only the users who have at least MIN_NUM_RATINGS ratings.
    ratings = ratings.groupby(movielens.USER_COLUMN).filter(
        lambda rows: len(rows) >= rconst.MIN_NUM_RATINGS
    )  # type: pd.DataFrame

    original_users = ratings[movielens.USER_COLUMN].unique()
    original_items = ratings[movielens.ITEM_COLUMN].unique()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_MIN_RATINGS,
                            value=rconst.MIN_NUM_RATINGS)

    # Assign each raw user / item id a dense, zero-based index.
    tf.logging.info("Generating user_map and item_map...")
    user_map = dict(zip(original_users, range(len(original_users))))
    item_map = dict(zip(original_items, range(len(original_items))))

    ratings[movielens.USER_COLUMN] = ratings[movielens.USER_COLUMN].apply(
        lambda raw_user: user_map[raw_user])
    ratings[movielens.ITEM_COLUMN] = ratings[movielens.ITEM_COLUMN].apply(
        lambda raw_item: item_map[raw_item])

    num_users = len(original_users)
    num_items = len(original_items)

    # Logged value: one positive plus NUM_EVAL_NEGATIVES negatives per user.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                            value=num_users * (1 + rconst.NUM_EVAL_NEGATIVES))
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT,
        value=match_mlperf)

    # Sanity checks: ids must fit int32 (users) / uint16 (items), and the
    # remapping must be dense.
    assert num_users <= np.iinfo(np.int32).max
    assert num_items <= np.iinfo(np.uint16).max
    assert ratings[movielens.USER_COLUMN].max() == num_users - 1
    assert ratings[movielens.ITEM_COLUMN].max() == num_items - 1

    # The sort shards the dataframe by user, and later lets the last item of
    # each user be picked for validation.
    tf.logging.info("Sorting by user, timestamp...")

    user_then_time = [movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN]
    if match_mlperf:
        # Equivalent to the non-MLPerf sort, except that rows sharing the same
        # user and timestamp may end up in a different order. For some reason
        # this ordering gives a better hit-rate during evaluation, matching
        # the performance of the MLPerf reference implementation.
        ratings.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
        ratings.sort_values(user_then_time, inplace=True, kind="mergesort")
    else:
        ratings.sort_values(user_then_time, inplace=True)

    # Neither the filter nor the sort rebuilds the index, so do it here.
    ratings = ratings.reset_index()

    return ratings, user_map, item_map
Exemplo n.º 5
0
def construct_model(user_input, item_input, params, need_strip=False):
    # type: (tf.Tensor, tf.Tensor, dict, bool) -> tf.keras.Model
    """Build the NeuMF Keras model on top of existing input layers.

    Args:
      user_input: keras input layer for users
      item_input: keras input layer for items
      params: Dict of hyperparameters. Keys read here: "num_users",
        "num_items", "model_layers", "mf_regularization", "mlp_reg_layers",
        "mf_dim", and "batch_size" (only when `need_strip` is True).
      need_strip: If True, both inputs are first passed through
        `_strip_first_and_last_dimension` (with the configured batch size)
        before the embedding lookup.

    Raises:
      ValueError: if the first model layer is not even.

    Returns:
      model:  a keras Model for computing the logits
    """
    num_users = params["num_users"]
    num_items = params["num_items"]
    layer_sizes = params["model_layers"]
    mf_regularization = params["mf_regularization"]
    mlp_reg_layers = params["mlp_reg_layers"]
    mf_dim = params["mf_dim"]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM,
                            value=mf_dim)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                            value=layer_sizes)

    # Half of the first MLP layer comes from the user embedding and half from
    # the item embedding, so its size has to be even.
    if layer_sizes[0] % 2:
        raise ValueError("The first layer size should be multiple of 2!")

    # Initializer for embedding layers
    embedding_initializer = "glorot_uniform"

    # The tensors actually fed to the embedding lookups.
    user_ids, item_ids = user_input, item_input
    if need_strip:
        batch_size = params["batch_size"]
        strip_fn = lambda x: _strip_first_and_last_dimension(x, batch_size)
        user_ids = tf.keras.layers.Lambda(strip_fn)(user_input)
        item_ids = tf.keras.layers.Lambda(strip_fn)(item_input)

    # It turns out to be significantly more efficient to store the MF and MLP
    # embedding portions in the same table, and then slice as needed: the
    # first mf_dim columns are the MF part, the rest is the MLP part.
    embedding_width = mf_dim + layer_sizes[0] // 2
    mf_slice_fn = lambda x: x[:, :mf_dim]
    mlp_slice_fn = lambda x: x[:, mf_dim:]

    user_table = tf.keras.layers.Embedding(
        num_users,
        embedding_width,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
        input_length=1,
        name="embedding_user")
    embedding_user = user_table(user_ids)

    item_table = tf.keras.layers.Embedding(
        num_items,
        embedding_width,
        embeddings_initializer=embedding_initializer,
        embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
        input_length=1,
        name="embedding_item")
    embedding_item = item_table(item_ids)

    # GMF part
    mf_user_latent = tf.keras.layers.Lambda(
        mf_slice_fn, name="embedding_user_mf")(embedding_user)
    mf_item_latent = tf.keras.layers.Lambda(
        mf_slice_fn, name="embedding_item_mf")(embedding_item)

    # MLP part
    mlp_user_latent = tf.keras.layers.Lambda(
        mlp_slice_fn, name="embedding_user_mlp")(embedding_user)
    mlp_item_latent = tf.keras.layers.Lambda(
        mlp_slice_fn, name="embedding_item_mlp")(embedding_item)

    # GMF output: element-wise product of the two MF latent vectors.
    mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

    # MLP input: concatenation of the two MLP latent vectors.
    mlp_vector = tf.keras.layers.concatenate(
        [mlp_user_latent, mlp_item_latent])

    # Entry 0 of model_layers only sizes the embeddings; the dense stack
    # starts at entry 1.
    for depth in xrange(1, len(layer_sizes)):
        dense = tf.keras.layers.Dense(
            layer_sizes[depth],
            kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[depth]),
            activation="relu")
        mlp_vector = dense(mlp_vector)

    # Concatenate GMF and MLP parts
    predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

    # Final prediction layer: a single linear unit producing the logit.
    output_layer = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer="lecun_uniform",
        name=movielens.RATING_COLUMN)
    logits = output_layer(predict_vector)

    # Print model topology.
    model = tf.keras.models.Model([user_input, item_input], logits)
    model.summary()
    sys.stdout.flush()

    return model
Exemplo n.º 6
0
def construct_model(users, items, params):
    # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
    """Initialize NeuMF model.

    Args:
      users: Tensor of user ids.
      items: Tensor of item ids.
      params: Dict of hyperparameters. Keys read here: "num_users",
        "num_items", "model_layers", "mf_regularization", "mlp_reg_layers",
        "mf_dim" and "use_tpu".

    Raises:
      ValueError: if the first model layer is not even.

    Returns:
      model:  a keras Model mapping the user and item inputs to the logits
    """

    num_users = params["num_users"]
    num_items = params["num_items"]

    model_layers = params["model_layers"]

    mf_regularization = params["mf_regularization"]
    mlp_reg_layers = params["mlp_reg_layers"]

    mf_dim = params["mf_dim"]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM,
                            value=mf_dim)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                            value=model_layers)

    # Half of the first MLP layer comes from the user embedding and half from
    # the item embedding, so its size has to be even.
    if model_layers[0] % 2 != 0:
        raise ValueError("The first layer size should be multiple of 2!")

    # Input variables
    user_input = tf.keras.layers.Input(tensor=users)
    item_input = tf.keras.layers.Input(tensor=items)
    # Leading dimension of the user input; used below as the row count of the
    # tf.slice calls in the TPU branch.
    batch_size = user_input.get_shape()[0]

    if params["use_tpu"]:
        # TPU path: one combined table per entity, created with
        # tf.get_variable (AUTO_REUSE) and read with tf.gather. Columns
        # [0, model_layers[0] // 2) hold the MLP part and the following mf_dim
        # columns hold the MF part.
        # NOTE: no regularizer is passed to these variables, so
        # mf_regularization and mlp_reg_layers[0] are not applied on this path.
        with tf.variable_scope("embed_weights", reuse=tf.AUTO_REUSE):
            cmb_embedding_user = tf.get_variable(
                name="embeddings_mf_user",
                shape=[num_users, mf_dim + model_layers[0] // 2],
                initializer=tf.glorot_uniform_initializer())

            cmb_embedding_item = tf.get_variable(
                name="embeddings_mf_item",
                shape=[num_items, mf_dim + model_layers[0] // 2],
                initializer=tf.glorot_uniform_initializer())

            # Look up the combined embedding rows for the given ids.
            cmb_user_latent = tf.keras.layers.Lambda(
                lambda ids: tf.gather(cmb_embedding_user, ids))(user_input)

            cmb_item_latent = tf.keras.layers.Lambda(
                lambda ids: tf.gather(cmb_embedding_item, ids))(item_input)

            # MLP part: the first model_layers[0] // 2 columns.
            mlp_user_latent = tf.keras.layers.Lambda(lambda x: tf.slice(
                x, [0, 0], [batch_size, model_layers[0] // 2]))(
                    cmb_user_latent)

            mlp_item_latent = tf.keras.layers.Lambda(lambda x: tf.slice(
                x, [0, 0], [batch_size, model_layers[0] // 2]))(
                    cmb_item_latent)

            # GMF part: the mf_dim columns that follow the MLP columns.
            mf_user_latent = tf.keras.layers.Lambda(lambda x: tf.slice(
                x, [0, model_layers[0] // 2], [batch_size, mf_dim]))(
                    cmb_user_latent)

            mf_item_latent = tf.keras.layers.Lambda(lambda x: tf.slice(
                x, [0, model_layers[0] // 2], [batch_size, mf_dim]))(
                    cmb_item_latent)

    else:
        # Non-TPU path: four separate, L2-regularized Keras Embedding layers.
        # Initializer for embedding layers
        embedding_initializer = "glorot_uniform"

        # Embedding layers of GMF and MLP
        mf_embedding_user = tf.keras.layers.Embedding(
            num_users,
            mf_dim,
            embeddings_initializer=embedding_initializer,
            embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
            input_length=1)
        mf_embedding_item = tf.keras.layers.Embedding(
            num_items,
            mf_dim,
            embeddings_initializer=embedding_initializer,
            embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
            input_length=1)

        mlp_embedding_user = tf.keras.layers.Embedding(
            num_users,
            model_layers[0] // 2,
            embeddings_initializer=embedding_initializer,
            embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
            input_length=1)
        mlp_embedding_item = tf.keras.layers.Embedding(
            num_items,
            model_layers[0] // 2,
            embeddings_initializer=embedding_initializer,
            embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
            input_length=1)

        # GMF part
        mf_user_latent = mf_embedding_user(user_input)
        mf_item_latent = mf_embedding_item(item_input)

        # MLP part
        mlp_user_latent = mlp_embedding_user(user_input)
        mlp_item_latent = mlp_embedding_item(item_input)

    # Element-wise multiply
    mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

    # Concatenation of two latent features
    mlp_vector = tf.keras.layers.concatenate(
        [mlp_user_latent, mlp_item_latent])

    # Entry 0 of model_layers only sizes the embeddings; the dense stack
    # starts at entry 1.
    num_layer = len(model_layers)  # Number of layers in the MLP
    for layer in xrange(1, num_layer):
        model_layer = tf.keras.layers.Dense(
            model_layers[layer],
            kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[layer]),
            activation="relu")
        mlp_vector = model_layer(mlp_vector)

    # Concatenate GMF and MLP parts
    predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

    # Final prediction layer
    logits = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer="lecun_uniform",
        name=movielens.RATING_COLUMN)(predict_vector)

    # Print model topology.
    model = tf.keras.models.Model([user_input, item_input], logits)
    model.summary()
    sys.stdout.flush()

    return model
Exemplo n.º 7
0
def run_ncf(_):
    """Run the NCF train / evaluate cycles driven by the command-line flags.

    Each cycle trains the estimator, evaluates it, logs HR / NDCG / loss, and
    stops early once the hit rate passes `FLAGS.hr_threshold`.

    Args:
      _: Unused positional argument.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    params = parse_flags(FLAGS)
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = (
            data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[FLAGS.dataset])
        num_train_steps = num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = (
            data_preprocessing.instantiate_pipeline(
                dataset=FLAGS.dataset,
                data_dir=FLAGS.data_dir,
                params=params,
                constructor_type=FLAGS.constructor_type,
                deterministic=FLAGS.seed is not None))

        # Every step consumes `batches_per_step` batches, so an epoch must
        # hold a whole number of steps.
        batches_per_step = params["batches_per_step"]
        num_train_steps, train_leftover = divmod(
            producer.train_batches_per_epoch, batches_per_step)
        num_eval_steps, eval_leftover = divmod(
            producer.eval_batches_per_epoch, batches_per_step)
        assert not train_leftover
        assert not eval_leftover
    producer.start()

    params["num_users"] = num_users
    params["num_items"] = num_items
    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])

    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        # MLPerf compliance logging is only allowed when evaluating after
        # every epoch.
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        estimator.train(input_fn=producer.make_input_fn(is_training=True),
                        hooks=train_hooks,
                        steps=num_train_steps)

        tf.logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        tf.logging.info("Evaluation complete.")

        hit_rate = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        # Per-epoch compliance records: target, achieved accuracy, and the
        # number of evaluation negatives.
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.EVAL_TARGET,
            value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.EVAL_ACCURACY,
            value={"epoch": cycle_index, "value": hit_rate})
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
            value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        summary = "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}"
        tf.logging.info(summary.format(cycle_index + 1, hit_rate, ndcg, loss))

        # Stop as soon as the hit-rate target has been met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Exemplo n.º 8
0
def construct_model(users, items, params):
  # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
  """Build the NeuMF Keras model from user and item id tensors.

  Args:
    users: Tensor of user ids.
    items: Tensor of item ids.
    params: Dict of hyperparameters. Keys read here: "num_users", "num_items",
      "model_layers", "mf_regularization", "mlp_reg_layers" and "mf_dim".

  Raises:
    ValueError: if the first model layer is not even.

  Returns:
    model:  a keras Model for computing the logits
  """
  num_users = params["num_users"]
  num_items = params["num_items"]
  layer_sizes = params["model_layers"]
  mf_regularization = params["mf_regularization"]
  mlp_reg_layers = params["mlp_reg_layers"]
  mf_dim = params["mf_dim"]

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM, value=mf_dim)
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                          value=layer_sizes)

  # Half of the first MLP layer comes from the user embedding and half from
  # the item embedding, so its size has to be even.
  if layer_sizes[0] % 2:
    raise ValueError("The first layer size should be multiple of 2!")

  # Input variables
  user_input = tf.keras.layers.Input(tensor=users, name="user_input")
  item_input = tf.keras.layers.Input(tensor=items, name="item_input")

  # Initializer for embedding layers
  embedding_initializer = "glorot_uniform"

  # It turns out to be significantly more efficient to store the MF and MLP
  # embedding portions in the same table, and then slice as needed: the first
  # mf_dim columns are the MF part, the rest is the MLP part.
  embedding_width = mf_dim + layer_sizes[0] // 2
  mf_slice_fn = lambda x: x[:, :mf_dim]
  mlp_slice_fn = lambda x: x[:, mf_dim:]

  user_table = tf.keras.layers.Embedding(
      num_users,
      embedding_width,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
      input_length=1,
      name="embedding_user")
  embedding_user = user_table(user_input)

  item_table = tf.keras.layers.Embedding(
      num_items,
      embedding_width,
      embeddings_initializer=embedding_initializer,
      embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
      input_length=1,
      name="embedding_item")
  embedding_item = item_table(item_input)

  # GMF part
  mf_user_latent = tf.keras.layers.Lambda(
      mf_slice_fn, name="embedding_user_mf")(embedding_user)
  mf_item_latent = tf.keras.layers.Lambda(
      mf_slice_fn, name="embedding_item_mf")(embedding_item)

  # MLP part
  mlp_user_latent = tf.keras.layers.Lambda(
      mlp_slice_fn, name="embedding_user_mlp")(embedding_user)
  mlp_item_latent = tf.keras.layers.Lambda(
      mlp_slice_fn, name="embedding_item_mlp")(embedding_item)

  # GMF output: element-wise product of the two MF latent vectors.
  mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])

  # MLP input: concatenation of the two MLP latent vectors.
  mlp_vector = tf.keras.layers.concatenate([mlp_user_latent, mlp_item_latent])

  # Entry 0 of model_layers only sizes the embeddings; the dense stack starts
  # at entry 1.
  for depth in xrange(1, len(layer_sizes)):
    dense = tf.keras.layers.Dense(
        layer_sizes[depth],
        kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[depth]),
        activation="relu")
    mlp_vector = dense(mlp_vector)

  # Concatenate GMF and MLP parts
  predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])

  # Final prediction layer: a single linear unit producing the logit.
  output_layer = tf.keras.layers.Dense(
      1,
      activation=None,
      kernel_initializer="lecun_uniform",
      name=movielens.RATING_COLUMN)
  logits = output_layer(predict_vector)

  # Print model topology.
  model = tf.keras.models.Model([user_input, item_input], logits)
  model.summary()
  sys.stdout.flush()

  return model
Exemplo n.º 9
0
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Sample negatives for every shard and write the result as TFRecords.

  The shards are processed by a worker pool. Results are scattered into
  pre-allocated user / item / label arrays, padded up to a whole number of
  batches, split across at most `num_readers` files and written out. A small
  JSON "ready" file holding the batch size and batch count is written last.

  Args:
    is_training: True to build training records, False to build eval records.
    train_cycle: Index of the training cycle. Selects the output folder and
      is only read when `is_training` is True.
    num_workers: Upper bound on the number of pool workers.
    cache_paths: Paths object describing where the records are written.
    num_readers: Maximum number of record files. Files which end up without
      any batch are dropped, so fewer may be written.
    num_neg: Number of negatives generated per positive example.
    num_positives: Number of positive examples. Used to size the arrays
      before the workers have returned anything.
    num_items: Cardinality of the item set.
    epochs_per_cycle: How many times each shard is processed. Must be 1 when
      building eval records.
    batch_size: Number of points stored in each written record.
    training_shards: The shards holding the positive examples.
    deterministic: If True, results are consumed in submission order
      (`pool.imap`); otherwise in completion order (`pool.imap_unordered`).
    match_mlperf: Forwarded unchanged to `_process_shard`.

  Raises:
    ValueError: If the padded point count does not line up with `batch_size`.
  """
  timer_start = timeit.default_timer()
  tags = mlperf_helper.TAGS

  if not is_training:
    # Later logic assumes that every item of a given user sits in one batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=tags.INPUT_STEP_EVAL_NEG_GEN)
    mlperf_helper.ncf_print(key=tags.EVAL_HP_NUM_USERS, value=num_positives)
  else:
    mlperf_helper.ncf_print(key=tags.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(key=tags.INPUT_HP_NUM_NEG, value=num_neg)

    # The sampling itself is configured inside _process_shard().
    mlperf_helper.ncf_print(
        key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  assert epochs_per_cycle == 1 or is_training
  num_workers = min(num_workers, len(training_shards) * epochs_per_cycle)

  num_pts = num_positives * (1 + num_neg)

  # Round up to a whole number of batches with integer arithmetic only, which
  # avoids the precision concerns of a floating point ceil.
  num_batches = (num_pts + batch_size - 1) // batch_size
  num_pts_with_padding = num_batches * batch_size
  num_padding = num_pts_with_padding - num_pts

  # Every shard pass gets its own seed so that the worker processes do not
  # all draw the same random numbers.
  shard_passes = training_shards * epochs_per_cycle
  seeds = [stat_utils.random_int32() for _ in shard_passes]
  map_args = [
      (shard, num_items, num_neg, seed, is_training, match_mlperf)
      for shard, seed in zip(shard_passes, seeds)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    if deterministic:
      map_fn = pool.imap  # pylint: disable=no-member
    else:
      map_fn = pool.imap_unordered  # pylint: disable=no-member
    segments = map_fn(_process_shard, map_args)

    # Allocated after the map has been submitted. A user id of -1 marks a
    # slot which has not been filled yet.
    users = np.full((num_pts_with_padding,), -1, dtype=np.int32)
    items = np.zeros((num_pts_with_padding,), dtype=np.uint16)
    labels = np.zeros((num_pts_with_padding,), dtype=np.int8)
    columns = (users, items, labels)

    # Training data is shuffled. Evaluation data MUST keep its order, because
    # downstream processing relies on the points of one user being grouped
    # within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    cursor = 0
    for segment in segments:
      segment_end = cursor + segment[0].shape[0]
      dest = index_destinations[cursor:segment_end]
      for idx, column in enumerate(columns):
        column[dest] = segment[idx]
      cursor = segment_end

  # Only the padding slots may still be unfilled at this point.
  assert (users == -1).sum() == num_padding

  if not is_training:
    # Eval padding is all zeros. The evaluation input_fn knows how to
    # interpret and discard the zero padded entries.
    users[num_pts:] = 0
  elif num_padding:
    # Complete the final batch with points drawn at random from the real ones.
    mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
    pad_sources = np.random.randint(
        low=0, high=num_pts, size=(num_padding,))
    pad_slots = np.arange(start=cursor, stop=cursor + num_padding)
    for column in columns:
      column[pad_slots] = column[pad_sources]

  # Check that no points were overlooked.
  assert not (users == -1).any()

  if is_training:
    # Slightly larger than num_pts because the padding is included.
    mlperf_helper.ncf_print(key=tags.INPUT_SIZE, value=int(users.shape[0]))
    mlperf_helper.ncf_print(key=tags.INPUT_BATCH_SIZE, value=batch_size)
  else:
    # num_pts rather than the array length: the zero pads are ignored.
    mlperf_helper.ncf_print(key=tags.EVAL_SIZE, value=num_pts)

  # Hand out consecutive runs of `batches_per_file` batch ids to each file.
  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  batches_by_file = [[] for _ in range(num_readers)]
  file_id = -1
  batch_id = 0
  while True:
    if (batch_id % batches_per_file) == 0:
      file_id += 1

    batch_start = batch_id * batch_size
    if batch_start + batch_size > num_pts_with_padding:
      if batch_start != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[file_id].append(batch_id)
    batch_id += 1

  # Drop files which were not assigned any batch.
  batches_by_file = [batch_ids for batch_ids in batches_by_file if batch_ids]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values
    # at the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
    first, last = batches_by_file[0][0], batches_by_file[-1][-1]
    batches_by_file[0][0], batches_by_file[-1][-1] = last, first

    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for file_index, batch_ids in enumerate(batches_by_file):
    fpath = os.path.join(record_dir, template.format(file_index))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for record_id in batch_ids:
        batch = slice(record_id * batch_size, (record_id + 1) * batch_size)
        batch_items = items[batch]
        record_kwargs = {"users": users[batch], "items": batch_items}

        if is_training:
          record_kwargs["labels"] = labels[batch]
        else:
          # One row per user: the positive followed by its negatives.
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              batch_items.reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        writer.write(_construct_record(**record_kwargs))
        batch_count += 1

  # Write to a temp file and rename it afterwards: writing the final file
  # directly could let the main process read a partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({"batch_size": batch_size, "batch_count": batch_count}, f)
  tf.gfile.Rename(ready_file_temp,
                  os.path.join(record_dir, rconst.READY_FILE))

  elapsed = timeit.default_timer() - timer_start
  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, elapsed))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(elapsed))
Exemplo n.º 10
0
def _construct_records(
        is_training,  # type: bool
        train_cycle,  # type: typing.Optional[int]
        num_workers,  # type: int
        cache_paths,  # type: rconst.Paths
        num_readers,  # type: int
        num_neg,  # type: int
        num_positives,  # type: int
        num_items,  # type: int
        epochs_per_cycle,  # type: int
        batch_size,  # type: int
        training_shards,  # type: typing.List[str]
        deterministic=False,  # type: bool
        match_mlperf=False  # type: bool
):
    """Sample negatives for every shard and write the result as TFRecords.

    The shards are processed by a worker pool. Results are scattered into
    pre-allocated user / item / label arrays, padded up to a whole number of
    batches, split across at most `num_readers` files and written out. A
    small JSON "ready" file holding the batch size and batch count is written
    last.

    Args:
      is_training: True to build training records, False to build eval
        records.
      train_cycle: Index of the training cycle. Selects the output folder and
        is only read when `is_training` is True.
      num_workers: Upper bound on the number of pool workers.
      cache_paths: Paths object describing where the records are written.
      num_readers: Maximum number of record files. Files which end up without
        any batch are dropped, so fewer may be written.
      num_neg: Number of negatives generated per positive example.
      num_positives: Number of positive examples. Used to size the arrays
        before the workers have returned anything.
      num_items: Cardinality of the item set.
      epochs_per_cycle: How many times each shard is processed. Must be 1
        when building eval records.
      batch_size: Number of points stored in each written record.
      training_shards: The shards holding the positive examples.
      deterministic: If True, results are consumed in submission order
        (`pool.imap`); otherwise in completion order
        (`pool.imap_unordered`).
      match_mlperf: Forwarded unchanged to `_process_shard`.

    Raises:
      ValueError: If the padded point count does not line up with
        `batch_size`.
    """
    timer_start = timeit.default_timer()
    tags = mlperf_helper.TAGS

    if not is_training:
        # Later logic assumes every item of a given user sits in one batch.
        assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
        assert num_neg == rconst.NUM_EVAL_NEGATIVES

        mlperf_helper.ncf_print(key=tags.INPUT_STEP_EVAL_NEG_GEN)
        mlperf_helper.ncf_print(key=tags.EVAL_HP_NUM_USERS,
                                value=num_positives)
    else:
        mlperf_helper.ncf_print(key=tags.INPUT_STEP_TRAIN_NEG_GEN)
        mlperf_helper.ncf_print(key=tags.INPUT_HP_NUM_NEG, value=num_neg)

        # The sampling itself is configured inside _process_shard().
        mlperf_helper.ncf_print(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
                                value=True)

    assert epochs_per_cycle == 1 or is_training
    num_workers = min(num_workers, len(training_shards) * epochs_per_cycle)

    num_pts = num_positives * (1 + num_neg)

    # Round up to a whole number of batches with integer arithmetic only,
    # which avoids the precision concerns of a floating point ceil.
    num_batches = (num_pts + batch_size - 1) // batch_size
    num_pts_with_padding = num_batches * batch_size
    num_padding = num_pts_with_padding - num_pts

    # Every shard pass gets its own seed so that the worker processes do not
    # all draw the same random numbers.
    shard_passes = training_shards * epochs_per_cycle
    seeds = [stat_utils.random_int32() for _ in shard_passes]
    map_args = [
        (shard, num_items, num_neg, seed, is_training, match_mlperf)
        for shard, seed in zip(shard_passes, seeds)
    ]

    with popen_helper.get_pool(num_workers, init_worker) as pool:
        if deterministic:
            map_fn = pool.imap  # pylint: disable=no-member
        else:
            map_fn = pool.imap_unordered  # pylint: disable=no-member
        segments = map_fn(_process_shard, map_args)

        # Allocated after the map has been submitted. A user id of -1 marks
        # a slot which has not been filled yet.
        users = np.full((num_pts_with_padding, ), -1, dtype=np.int32)
        items = np.zeros((num_pts_with_padding, ), dtype=np.uint16)
        labels = np.zeros((num_pts_with_padding, ), dtype=np.int8)
        columns = (users, items, labels)

        # Training data is shuffled. Evaluation data MUST keep its order,
        # because downstream processing relies on the points of one user
        # being grouped within a batch.
        if is_training:
            index_destinations = np.random.permutation(num_pts)
            mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
        else:
            index_destinations = np.arange(num_pts)

        cursor = 0
        for segment in segments:
            segment_end = cursor + segment[0].shape[0]
            dest = index_destinations[cursor:segment_end]
            for idx, column in enumerate(columns):
                column[dest] = segment[idx]
            cursor = segment_end

    # Only the padding slots may still be unfilled at this point.
    assert (users == -1).sum() == num_padding

    if not is_training:
        # Eval padding is all zeros. The evaluation input_fn knows how to
        # interpret and discard the zero padded entries.
        users[num_pts:] = 0
    elif num_padding:
        # Complete the final batch with points drawn at random from the
        # real ones.
        mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
        pad_sources = np.random.randint(low=0,
                                        high=num_pts,
                                        size=(num_padding, ))
        pad_slots = np.arange(start=cursor, stop=cursor + num_padding)
        for column in columns:
            column[pad_slots] = column[pad_sources]

    # Check that no points were overlooked.
    assert not (users == -1).any()

    if is_training:
        # Slightly larger than num_pts because the padding is included.
        mlperf_helper.ncf_print(key=tags.INPUT_SIZE,
                                value=int(users.shape[0]))
        mlperf_helper.ncf_print(key=tags.INPUT_BATCH_SIZE, value=batch_size)
    else:
        # num_pts rather than the array length: the zero pads are ignored.
        mlperf_helper.ncf_print(key=tags.EVAL_SIZE, value=num_pts)

    # Hand out consecutive runs of `batches_per_file` batch ids to each file.
    batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
    batches_by_file = [[] for _ in range(num_readers)]
    file_id = -1
    batch_id = 0
    while True:
        if (batch_id % batches_per_file) == 0:
            file_id += 1

        batch_start = batch_id * batch_size
        if batch_start + batch_size > num_pts_with_padding:
            if batch_start != num_pts_with_padding:
                raise ValueError("Batch padding does not line up")
            break
        batches_by_file[file_id].append(batch_id)
        batch_id += 1

    # Drop files which were not assigned any batch.
    batches_by_file = [
        batch_ids for batch_ids in batches_by_file if batch_ids
    ]
    num_readers = len(batches_by_file)

    if is_training:
        # Empirically it is observed that placing the batch with repeated
        # values at the start rather than the end improves convergence.
        mlperf_helper.ncf_print(key=tags.INPUT_ORDER)
        first, last = batches_by_file[0][0], batches_by_file[-1][-1]
        batches_by_file[0][0], batches_by_file[-1][-1] = last, first

        template = rconst.TRAIN_RECORD_TEMPLATE
        record_dir = os.path.join(cache_paths.train_epoch_dir,
                                  get_cycle_folder_name(train_cycle))
        tf.gfile.MakeDirs(record_dir)
    else:
        template = rconst.EVAL_RECORD_TEMPLATE
        record_dir = cache_paths.eval_data_subdir

    batch_count = 0
    for file_index, batch_ids in enumerate(batches_by_file):
        fpath = os.path.join(record_dir, template.format(file_index))
        log_msg("Writing {}".format(fpath))
        with tf.python_io.TFRecordWriter(fpath) as writer:
            for record_id in batch_ids:
                batch = slice(record_id * batch_size,
                              (record_id + 1) * batch_size)
                batch_items = items[batch]
                record_kwargs = {
                    "users": users[batch],
                    "items": batch_items,
                }

                if is_training:
                    record_kwargs["labels"] = labels[batch]
                else:
                    # One row per user: the positive followed by its
                    # negatives.
                    record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
                        batch_items.reshape(-1, num_neg + 1),
                        axis=1).flatten().astype(np.int8)

                writer.write(_construct_record(**record_kwargs))
                batch_count += 1

    # Write to a temp file and rename it afterwards: writing the final file
    # directly could let the main process read a partially written JSON
    # file.
    ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
    with tf.gfile.Open(ready_file_temp, "w") as f:
        json.dump({"batch_size": batch_size, "batch_count": batch_count}, f)
    tf.gfile.Rename(ready_file_temp,
                    os.path.join(record_dir, rconst.READY_FILE))

    elapsed = timeit.default_timer() - timer_start
    if is_training:
        log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
            train_cycle, elapsed))
    else:
        log_msg(
            "Eval construction complete. Total time: {:.1f} seconds".format(
                elapsed))
Exemplo n.º 11
0
def run_ncf(_):
    """Train and evaluate NCF, then export the model and convert it to ONNX.

    Every cycle trains for `num_train_steps`, evaluates for `num_eval_steps`
    and reports HR, NDCG and loss. The loop stops early as soon as
    `model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr)` is true.

    Afterwards the estimator is exported under "saved_model", the newest
    export is loaded and frozen, the frozen graph is converted to an ONNX
    model, and the serialized bytes are passed to
    `movielens.run_pio_workflow`. Finally the data producer is stopped.

    Args:
      _: Unused positional argument.
    """
    params = ncf_common.parse_flags(FLAGS)

    pipeline = ncf_common.get_inputs(params)
    num_users, num_items, num_train_steps, num_eval_steps, producer = pipeline

    params["num_users"] = num_users
    params["num_items"] = num_items
    producer.start()
    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

    eval_batch_size = params["eval_batch_size"]
    benchmark_logger, train_hooks = log_and_get_hooks(eval_batch_size)
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    tags = mlperf_helper.TAGS
    target_reached = False
    mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert (FLAGS.epochs_between_evals == 1
                or not mlperf_helper.LOGGER.enabled)
        logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle_index)

        estimator.train(input_fn=producer.make_input_fn(is_training=True),
                        hooks=train_hooks,
                        steps=num_train_steps)

        logging.info("Beginning evaluation.")
        eval_input_fn = producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle_index)
        eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
        logging.info("Evaluation complete.")

        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        loss = float(eval_results["loss"])

        # Each of these entries is tagged with the epoch it belongs to.
        per_epoch_entries = (
            (tags.EVAL_TARGET, FLAGS.hr_threshold),
            (tags.EVAL_ACCURACY, hr),
            (tags.EVAL_HP_NUM_NEG, rconst.NUM_EVAL_NEGATIVES),
        )
        for tag, reported in per_epoch_entries:
            mlperf_helper.ncf_print(
                key=tag, value={"epoch": cycle_index, "value": reported})

        mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle_index)

        # Benchmark the evaluation results, then log HR / NDCG / loss.
        benchmark_logger.log_evaluation_result(eval_results)
        logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_index + 1, hr, ndcg, loss))

        # Stop training once the evaluation threshold has been met.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    def serving_input_fn():
        """Build the serving receiver; every input is a shape-[1] placeholder."""
        placeholder_specs = (
            (movielens.USER_COLUMN, tf.int64),
            (movielens.ITEM_COLUMN, tf.int64),
            ("duplicate_mask", tf.float32),
        )
        inputs = {
            column: tf.placeholder(dtype=dtype, shape=[1], name=column)
            for column, dtype in placeholder_specs
        }
        # The same dict serves as both the features and the receiver tensors.
        return tf.estimator.export.ServingInputReceiver(inputs, inputs)

    saved_model_dir = "saved_model"
    estimator.export_saved_model(saved_model_dir, serving_input_fn)
    print("saved")

    # Take the last export directory in sorted order, skipping any path
    # containing 'temp'.
    export_dirs = sorted(
        path for path in Path(saved_model_dir).iterdir()
        if path.is_dir() and 'temp' not in str(path))
    latest = str(export_dirs[-1])

    # Name of the tensor used as the output of the frozen graph and of the
    # ONNX model.
    output_name = "concat:0"

    with tf.Session() as sess_tf:
        loaded = tf.saved_model.loader.load(
            sess_tf, [tf.saved_model.tag_constants.SERVING], latest)

        graph = loaded.graph_def
        tf.import_graph_def(graph, name='')
        print("loaded")

        for node in graph.node:
            print('\n', node)
        frozen_graph = loader.freeze_session(sess_tf,
                                             output_names=[output_name])

    # Start from an empty default graph before importing the frozen one.
    tf.reset_default_graph()
    with tf.Session() as sess_tf:
        tf.import_graph_def(frozen_graph, name='')

        print(type(frozen_graph))

        onnx_graph = process_tf_graph(sess_tf.graph,
                                      opset=7,
                                      input_names=["userid:0", "itemid:0"],
                                      output_names=[output_name])
        onnx_model_string = onnx_graph.make_model("ncf").SerializeToString()

        movielens.run_pio_workflow(bytearray(onnx_model_string),
                                   movielens.user_map,
                                   movielens.item_map,
                                   orig_sys_args)

    mlperf_helper.ncf_print(key=tags.RUN_STOP,
                            value={"success": target_reached})
    producer.stop_loop()
    producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
    mlperf_helper.ncf_print(key=tags.RUN_FINAL)
Exemplo n.º 12
0
def neumf_model_fn(features, labels, mode, params):
    """Estimator model_fn for NeuMF.

    Args:
      features: Dict of input tensors. `movielens.USER_COLUMN` and
        `movielens.ITEM_COLUMN` are always read; `rconst.DUPLICATE_MASK` is
        read in EVAL mode.
      labels: Label tensor. Only used in TRAIN mode.
      mode: One of the `tf.estimator.ModeKeys` values.
      params: Dict of hyperparameters and run options.

    Returns:
      For PREDICT and TRAIN, a `TPUEstimatorSpec` when `params["use_tpu"]` is
      set and an `EstimatorSpec` otherwise. For EVAL, the result of
      `compute_eval_loss_and_metrics`.

    Raises:
      NotImplementedError: If `mode` is not PREDICT, EVAL or TRAIN.
    """
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)
    logits = construct_model(users=users, items=items, params=params)

    # Softmax over [0, logit] is equivalent to sigmoid(logit).
    # NOTE(review): this concat is built in every mode and left unnamed; code
    # elsewhere may look it up by its auto-generated name, so keep it here.
    zero_column = tf.zeros(logits.shape, dtype=logits.dtype)
    softmax_logits = tf.concat([zero_column, logits], axis=1)

    mode_keys = tf.estimator.ModeKeys

    if mode == mode_keys.PREDICT:
        predictions = {
            movielens.ITEM_COLUMN: items,
            movielens.RATING_COLUMN: logits,
        }
        spec_cls = (tf.contrib.tpu.TPUEstimatorSpec
                    if params["use_tpu"] else tf.estimator.EstimatorSpec)
        return spec_cls(mode=mode, predictions=predictions)

    if mode == mode_keys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        num_neg = params["num_neg"]
        match_mlperf = params["match_mlperf"]
        use_tpu_spec = params["use_tpu"] or params["use_xla_for_gpu"]
        return compute_eval_loss_and_metrics(logits,
                                             softmax_logits,
                                             duplicate_mask,
                                             num_neg,
                                             match_mlperf,
                                             use_tpu_spec=use_tpu_spec)

    if mode != mode_keys.TRAIN:
        raise NotImplementedError

    labels = tf.cast(labels, tf.int32)
    tags = mlperf_helper.TAGS

    # Each Adam hyperparameter is logged under its MLPerf tag and then passed
    # to the optimizer under the same name it has in `params`.
    adam_hyperparams = (
        (tags.OPT_LR, "learning_rate"),
        (tags.OPT_HP_ADAM_BETA1, "beta1"),
        (tags.OPT_HP_ADAM_BETA2, "beta2"),
        (tags.OPT_HP_ADAM_EPSILON, "epsilon"),
    )
    mlperf_helper.ncf_print(key=tags.OPT_NAME, value="adam")
    for tag, hp_name in adam_hyperparams:
        mlperf_helper.ncf_print(key=tag, value=params[hp_name])

    optimizer = tf.train.AdamOptimizer(
        **{hp_name: params[hp_name] for _, hp_name in adam_hyperparams})
    if params["use_tpu"]:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=tags.MODEL_HP_LOSS_FN, value=tags.BCE)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=softmax_logits)

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    grads_and_vars = optimizer.compute_gradients(
        loss, tf.trainable_variables(), colocate_gradients_with_ops=True)
    grads_and_vars = _sparse_to_dense_grads(grads_and_vars)
    minimize_op = optimizer.apply_gradients(grads_and_vars,
                                            global_step=global_step,
                                            name="train")
    # Run any registered update ops together with the optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    spec_cls = (tf.contrib.tpu.TPUEstimatorSpec
                if params["use_tpu"] else tf.estimator.EstimatorSpec)
    return spec_cls(mode=mode, loss=loss, train_op=train_op)
Exemplo n.º 13
0
def construct_model(users, items, params):
    # type: (tf.Tensor, tf.Tensor, dict) -> tf.Tensor
    """Build the NeuMF network on top of the given id tensors.

    A GMF branch (element-wise product of user and item embeddings) and an
    MLP branch (dense layers over the concatenated user and item embeddings)
    are concatenated and fed to a single-unit dense layer. The Keras model
    summary is printed as a side effect.

    Args:
      users: Tensor of user ids.
      items: Tensor of item ids.
      params: Dict of hyperparameters. Reads "num_users", "num_items",
        "model_layers", "mf_regularization", "mlp_reg_layers" and "mf_dim".

    Returns:
      The logits tensor produced by the final dense layer.

    Raises:
      ValueError: if the first model layer is not even.
    """
    num_users = params["num_users"]
    num_items = params["num_items"]
    model_layers = params["model_layers"]
    mf_regularization = params["mf_regularization"]
    mlp_reg_layers = params["mlp_reg_layers"]
    mf_dim = params["mf_dim"]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MF_DIM,
                            value=mf_dim)
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_MLP_LAYER_SIZES,
                            value=model_layers)

    # The first MLP layer is split evenly between the user and item
    # embeddings, so its size has to be even.
    if model_layers[0] % 2:
        raise ValueError("The first layer size should be multiple of 2!")

    def make_embedding(vocab_size, width, l2_scale):
        """Create an L2-regularized, glorot_uniform-initialized Embedding."""
        return tf.keras.layers.Embedding(
            vocab_size,
            width,
            embeddings_initializer="glorot_uniform",
            embeddings_regularizer=tf.keras.regularizers.l2(l2_scale),
            input_length=1)

    # Input variables
    user_input = tf.keras.layers.Input(tensor=users)
    item_input = tf.keras.layers.Input(tensor=items)

    # Embedding layers, created in a fixed order: GMF user, GMF item,
    # MLP user, MLP item.
    mf_embedding_user = make_embedding(num_users, mf_dim, mf_regularization)
    mf_embedding_item = make_embedding(num_items, mf_dim, mf_regularization)
    mlp_embedding_user = make_embedding(
        num_users, model_layers[0] // 2, mlp_reg_layers[0])
    mlp_embedding_item = make_embedding(
        num_items, model_layers[0] // 2, mlp_reg_layers[0])

    # GMF branch: element-wise product of the two latent vectors.
    mf_vector = tf.keras.layers.multiply(
        [mf_embedding_user(user_input), mf_embedding_item(item_input)])

    # MLP branch: concatenated latent vectors followed by the dense stack.
    mlp_vector = tf.keras.layers.concatenate(
        [mlp_embedding_user(user_input), mlp_embedding_item(item_input)])

    # model_layers[0] is the embedding width, so the dense layers start at 1.
    for depth in range(1, len(model_layers)):
        dense = tf.keras.layers.Dense(
            model_layers[depth],
            kernel_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[depth]),
            activation="relu")
        mlp_vector = dense(mlp_vector)

    # Join both branches and project to a single logit.
    predict_vector = tf.keras.layers.concatenate([mf_vector, mlp_vector])
    prediction_layer = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer="lecun_uniform",
        name=movielens.RATING_COLUMN)
    logits = prediction_layer(predict_vector)

    # Print model topology.
    topology = tf.keras.models.Model([user_input, item_input], logits)
    topology.summary()
    sys.stdout.flush()

    return logits
def _filter_index_sort(raw_rating_path, cache_path):
  # type: (str, str) -> (dict, bool)
  """Read in data CSV, and output structured data.

  This function reads in the raw CSV of positive items, and performs three
  preprocessing transformations:

  1)  Filter out all users who have not rated at least a certain number
      of items. (Typically 20 items)

  2)  Zero index the users and items such that the largest user_id is
      `num_users - 1` and the largest item_id is `num_items - 1`

  3)  Sort the dataframe by user_id, with timestamp as a secondary sort key.
      This allows the dataframe to be sliced by user in-place, and for the last
      item to be selected simply by calling the `-1` index of a user's slice.

  While all of these transformations are performed by Pandas (and are therefore
  single-threaded), they only take ~2 minutes, and the overhead to apply a
  MapReduce pattern to parallel process the dataset adds significant complexity
  for no computational gain. For a larger dataset parallelizing this
  preprocessing could yield speedups. (Also, this preprocessing step is only
  performed once for an entire run.)

  The result is pickled to `cache_path`. An existing cache file is reused only
  if it is younger than `rconst.CACHE_INVALIDATION_SEC` and contains every key
  in `_EXPECTED_CACHE_KEYS`; otherwise it is deleted and rebuilt.

  Args:
    raw_rating_path: The path to the CSV which contains the raw dataset.
    cache_path: The path to the file where results of this function are saved.

  Returns:
    A tuple `(data, valid_cache)`. `data` is a dict holding the train and eval
    user/item arrays (under `rconst.TRAIN_USER_KEY`, `rconst.TRAIN_ITEM_KEY`,
    `rconst.EVAL_USER_KEY` and `rconst.EVAL_ITEM_KEY`), the raw-id to
    zero-based-id dicts (under `rconst.USER_MAP` and `rconst.ITEM_MAP`) and a
    "create_time" timestamp. `valid_cache` is True when `data` was loaded from
    an existing cache file rather than recomputed.
  """
  valid_cache = tf.io.gfile.exists(cache_path)
  if valid_cache:
    # NOTE(review): pickle.load can execute arbitrary code; this assumes the
    # cache file is only ever written by this function -- confirm cache_path
    # is never attacker-controlled.
    with tf.io.gfile.GFile(cache_path, "rb") as f:
      cached_data = pickle.load(f)

    # A cache without a "create_time" entry is treated as created at epoch 0.
    cache_age = time.time() - cached_data.get("create_time", 0)
    if cache_age > rconst.CACHE_INVALIDATION_SEC:
      valid_cache = False

    # Any missing expected key invalidates the cache.
    for key in _EXPECTED_CACHE_KEYS:
      if key not in cached_data:
        valid_cache = False

    if not valid_cache:
      logging.info("Removing stale raw data cache file.")
      tf.io.gfile.remove(cache_path)

  if valid_cache:
    data = cached_data
  else:
    with tf.io.gfile.GFile(raw_rating_path) as f:
      df = pd.read_csv(f)

    # Keep only the users who have at least rconst.MIN_NUM_RATINGS ratings.
    grouped = df.groupby(movielens.USER_COLUMN)
    df = grouped.filter(
        lambda x: len(x) >= rconst.MIN_NUM_RATINGS) # type: pd.DataFrame

    original_users = df[movielens.USER_COLUMN].unique()
    original_items = df[movielens.ITEM_COLUMN].unique()

    # Map the ids of user and item to 0 based index for following processing
    logging.info("Generating user_map and item_map...")
    user_map = {user: index for index, user in enumerate(original_users)}
    item_map = {item: index for index, item in enumerate(original_items)}

    df[movielens.USER_COLUMN] = df[movielens.USER_COLUMN].apply(
        lambda user: user_map[user])
    df[movielens.ITEM_COLUMN] = df[movielens.ITEM_COLUMN].apply(
        lambda item: item_map[item])

    num_users = len(original_users)
    num_items = len(original_items)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.PREPROC_HP_NUM_EVAL,
                            value=rconst.NUM_EVAL_NEGATIVES)

    # Sanity checks: the counts must not exceed the maximum of the configured
    # integer dtypes, and the remapped ids must top out at count - 1.
    assert num_users <= np.iinfo(rconst.USER_DTYPE).max
    assert num_items <= np.iinfo(rconst.ITEM_DTYPE).max
    assert df[movielens.USER_COLUMN].max() == num_users - 1
    assert df[movielens.ITEM_COLUMN].max() == num_items - 1

    # This sort is used to shard the dataframe by user, and later to select
    # the last item for a user to be used in validation.
    logging.info("Sorting by user, timestamp...")

    # This sort is equivalent to
    #   df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
    #   inplace=True)
    # except that the order of items with the same user and timestamp are
    # sometimes different. For some reason, this sort results in a better
    # hit-rate during evaluation, matching the performance of the MLPerf
    # reference implementation.
    # Do not merge or reorder the two calls below: the tie order produced by
    # the first sort is carried through the (stable) mergesort that follows.
    df.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
    df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
                   inplace=True, kind="mergesort")

    df = df.reset_index()  # The dataframe does not reconstruct indices in the
                           # sort or filter steps.

    # Per user: the last row (after sorting) is held out for evaluation and
    # every earlier row is used for training.
    grouped = df.groupby(movielens.USER_COLUMN, group_keys=False)
    eval_df, train_df = grouped.tail(1), grouped.apply(lambda x: x.iloc[:-1])

    data = {
        rconst.TRAIN_USER_KEY: train_df[movielens.USER_COLUMN]
                               .values.astype(rconst.USER_DTYPE),
        rconst.TRAIN_ITEM_KEY: train_df[movielens.ITEM_COLUMN]
                               .values.astype(rconst.ITEM_DTYPE),
        rconst.EVAL_USER_KEY: eval_df[movielens.USER_COLUMN]
                              .values.astype(rconst.USER_DTYPE),
        rconst.EVAL_ITEM_KEY: eval_df[movielens.ITEM_COLUMN]
                              .values.astype(rconst.ITEM_DTYPE),
        rconst.USER_MAP: user_map,
        rconst.ITEM_MAP: item_map,
        # Read back by the cache-age check at the top of this function.
        "create_time": time.time(),
    }

    logging.info("Writing raw data cache.")
    with tf.io.gfile.GFile(cache_path, "wb") as f:
      pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

  # TODO(robieta): MLPerf cache clear.
  return data, valid_cache
Exemplo n.º 15
0
def run_ncf(_):
    """Run NCF training and eval loop.

    All configuration is read from the module-level `FLAGS`. The function
    optionally downloads the dataset, seeds NumPy, builds the data pipeline
    (or uses synthetic step counts), constructs the train and eval
    estimators, and then alternates one training cycle with one evaluation
    until every cycle has run or the hit rate passes `FLAGS.hr_threshold`.

    Args:
      _: Ignored.

    Raises:
      ValueError: If the batch count reported by
        `data_preprocessing.make_input_fn` differs from the step count
        computed here, for either training or evaluation.
    """
    if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_gpus = flags_core.get_num_gpus(FLAGS)
    batch_size = distribution_utils.per_device_batch_size(
        int(FLAGS.batch_size), num_gpus)

    # Each evaluated user contributes one positive plus the sampled negatives.
    eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    eval_batch_size = int(FLAGS.eval_batch_size
                          or max(FLAGS.batch_size, eval_per_user))
    if eval_batch_size % eval_per_user:
        # Round down to a whole number of users, but never below a single
        # user's worth of examples: a requested size smaller than
        # eval_per_user would otherwise round to 0 and break the step-count
        # division below.
        eval_batch_size = (
            max(eval_batch_size // eval_per_user, 1) * eval_per_user)
        tf.logging.warning(
            "eval examples per user does not evenly divide eval_batch_size. "
            "Overriding to {}".format(eval_batch_size))

    if FLAGS.use_synthetic_data:
        ncf_dataset = None
        cleanup_fn = lambda: None
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_neg=FLAGS.num_neg,
            epochs_per_cycle=FLAGS.epochs_between_evals,
            match_mlperf=FLAGS.ml_perf,
            deterministic=FLAGS.seed is not None,
            use_subprocess=FLAGS.use_subprocess,
            cache_id=FLAGS.cache_id)
        num_users = ncf_dataset.num_users
        num_items = ncf_dataset.num_items
        # Every training positive is accompanied by FLAGS.num_neg negatives.
        num_train_steps = int(
            np.ceil(FLAGS.epochs_between_evals *
                    ncf_dataset.num_train_positives * (1 + FLAGS.num_neg) /
                    FLAGS.batch_size))
        num_eval_steps = int(
            np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) * ncf_dataset.num_users /
                    eval_batch_size))

    model_helpers.apply_clean(flags.FLAGS)

    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus,
        model_dir=FLAGS.model_dir,
        params={
            "use_seed": FLAGS.seed is not None,
            "hash_pipeline": FLAGS.hash_pipeline,
            "batch_size": batch_size,
            "eval_batch_size": eval_batch_size,
            "learning_rate": FLAGS.learning_rate,
            "num_users": num_users,
            "num_items": num_items,
            "mf_dim": FLAGS.num_factors,
            "model_layers": [int(layer) for layer in FLAGS.layers],
            "mf_regularization": FLAGS.mf_regularization,
            "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
            "num_neg": FLAGS.num_neg,
            "use_tpu": FLAGS.tpu is not None,
            "tpu": FLAGS.tpu,
            "tpu_zone": FLAGS.tpu_zone,
            "tpu_gcp_project": FLAGS.tpu_gcp_project,
            "beta1": FLAGS.beta1,
            "beta2": FLAGS.beta2,
            "epsilon": FLAGS.epsilon,
            "match_mlperf": FLAGS.ml_perf,
            "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
        },
        batch_size=flags.FLAGS.batch_size,
        eval_batch_size=eval_batch_size)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        FLAGS.hooks,
        model_dir=FLAGS.model_dir,
        batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
        tensors_to_log={"cross_entropy": "cross_entropy"})
    run_params = {
        "batch_size": FLAGS.batch_size,
        "eval_batch_size": eval_batch_size,
        "number_factors": FLAGS.num_factors,
        "hr_threshold": FLAGS.hr_threshold,
        "train_epochs": FLAGS.train_epochs,
    }
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="recommendation",
                                  dataset_name=FLAGS.dataset,
                                  run_params=run_params,
                                  test_id=FLAGS.benchmark_test_id)

    # The eval input_fn is built on the first cycle and reused afterwards.
    pred_input_fn = None
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
    target_reached = False
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
        assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
        tf.logging.info("Starting a training cycle: {}/{}".format(
            cycle_index + 1, total_training_cycle))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                                value=cycle_index)

        # Train the model
        train_input_fn, train_record_dir, batch_count = \
          data_preprocessing.make_input_fn(
              ncf_dataset=ncf_dataset, is_training=True)

        if batch_count != num_train_steps:
            raise ValueError(
                "Step counts do not match. ({} vs. {}) The async process is "
                "producing incorrect shards.".format(batch_count,
                                                     num_train_steps))

        train_estimator.train(input_fn=train_input_fn,
                              hooks=train_hooks,
                              steps=num_train_steps)
        # The training records for this cycle are no longer needed.
        if train_record_dir:
            tf.gfile.DeleteRecursively(train_record_dir)

        tf.logging.info("Beginning evaluation.")
        if pred_input_fn is None:
            pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
                ncf_dataset=ncf_dataset, is_training=False)

            if eval_batch_count != num_eval_steps:
                raise ValueError(
                    "Step counts do not match. ({} vs. {}) The async process is "
                    "producing incorrect shards.".format(
                        eval_batch_count, num_eval_steps))

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                                value=cycle_index)
        eval_results = eval_estimator.evaluate(pred_input_fn,
                                               steps=num_eval_steps)
        hr = float(eval_results[rconst.HR_KEY])
        ndcg = float(eval_results[rconst.NDCG_KEY])
        tf.logging.info("Evaluation complete.")

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_TARGET,
                                value={
                                    "epoch": cycle_index,
                                    "value": FLAGS.hr_threshold
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                                value={
                                    "epoch": cycle_index,
                                    "value": hr
                                })
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
                                value={
                                    "epoch": cycle_index,
                                    "value": rconst.NUM_EVAL_NEGATIVES
                                })

        # Logged by the async process during record creation.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                deferred=True)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                                value=cycle_index)

        # Benchmark the evaluation results
        benchmark_logger.log_evaluation_result(eval_results)
        # Log the HR and NDCG results.
        tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

        # If some evaluation threshold is met
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
            target_reached = True
            break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
    cleanup_fn()  # Cleanup data construction artifacts and subprocess.

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Exemplo n.º 16
0
def run_ncf(_):
  """Run NCF training and eval loop.

  All configuration is read from the module-level `FLAGS`. The function
  optionally downloads the dataset, seeds NumPy, builds the data pipeline (or
  uses synthetic step counts), and then alternates one training cycle with one
  evaluation until every cycle has run or the hit rate passes
  `FLAGS.hr_threshold`. `FLAGS.use_estimator` selects between the Estimator
  path and the `model_runner.NcfModelRunner` path.

  Args:
    _: Ignored.

  Raises:
    ValueError: On the Estimator path, if the batch count reported by
      `data_preprocessing.make_input_fn` differs from the step count computed
      here, for either training or evaluation.
  """
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_gpus = flags_core.get_num_gpus(FLAGS)
  batch_size = distribution_utils.per_device_batch_size(
      int(FLAGS.batch_size), num_gpus)
  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  # Each evaluated user contributes one positive plus the sampled negatives.
  eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
  eval_batch_size = int(FLAGS.eval_batch_size or
                        max(FLAGS.batch_size, eval_per_user))
  if eval_batch_size % eval_per_user:
    # Round down to a whole number of users, but never below a single user's
    # worth of examples: a requested size smaller than eval_per_user would
    # otherwise round to 0 and break the step-count division below.
    eval_batch_size = max(eval_batch_size // eval_per_user, 1) * eval_per_user
    tf.logging.warning(
        "eval examples per user does not evenly divide eval_batch_size. "
        "Overriding to {}".format(eval_batch_size))

  if FLAGS.use_synthetic_data:
    ncf_dataset = None
    cleanup_fn = lambda: None
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_neg=FLAGS.num_neg,
        epochs_per_cycle=FLAGS.epochs_between_evals,
        num_cycles=total_training_cycle,
        match_mlperf=FLAGS.ml_perf,
        deterministic=FLAGS.seed is not None,
        use_subprocess=FLAGS.use_subprocess,
        cache_id=FLAGS.cache_id)
    num_users = ncf_dataset.num_users
    num_items = ncf_dataset.num_items
    # Every training positive is accompanied by FLAGS.num_neg negatives.
    num_train_steps = int(np.ceil(
        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
        (1 + FLAGS.num_neg) / FLAGS.batch_size))
    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
                                 ncf_dataset.num_users / eval_batch_size))

  model_helpers.apply_clean(flags.FLAGS)

  params = {
      "use_seed": FLAGS.seed is not None,
      "hash_pipeline": FLAGS.hash_pipeline,
      "batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "learning_rate": FLAGS.learning_rate,
      "num_users": num_users,
      "num_items": num_items,
      "mf_dim": FLAGS.num_factors,
      "model_layers": [int(layer) for layer in FLAGS.layers],
      "mf_regularization": FLAGS.mf_regularization,
      "mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
      "num_neg": FLAGS.num_neg,
      "use_tpu": FLAGS.tpu is not None,
      "tpu": FLAGS.tpu,
      "tpu_zone": FLAGS.tpu_zone,
      "tpu_gcp_project": FLAGS.tpu_gcp_project,
      "beta1": FLAGS.beta1,
      "beta2": FLAGS.beta2,
      "epsilon": FLAGS.epsilon,
      "match_mlperf": FLAGS.ml_perf,
      "use_xla_for_gpu": FLAGS.use_xla_for_gpu,
      "use_estimator": FLAGS.use_estimator,
  }
  # Only one of (train_estimator, eval_estimator) or runner is bound; the
  # training loop below branches on the same flag before using either.
  if FLAGS.use_estimator:
    train_estimator, eval_estimator = construct_estimator(
        num_gpus=num_gpus, model_dir=FLAGS.model_dir,
        iterations=num_train_steps, params=params,
        batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
  else:
    runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
                                         num_eval_steps, FLAGS.use_while_loop)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      model_dir=FLAGS.model_dir,
      batch_size=FLAGS.batch_size,  # for ExamplesPerSecondHook
      tensors_to_log={"cross_entropy": "cross_entropy"}
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "eval_batch_size": eval_batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params,
      test_id=FLAGS.benchmark_test_id)


  # The eval input_fn is built on the first cycle and reused afterwards.
  eval_input_fn = None
  target_reached = False
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
  for cycle_index in range(total_training_cycle):
    assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                            value=cycle_index)

    # Train the model
    if FLAGS.use_estimator:
      train_input_fn, train_record_dir, batch_count = \
        data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=True)

      if batch_count != num_train_steps:
        raise ValueError(
            "Step counts do not match. ({} vs. {}) The async process is "
            "producing incorrect shards.".format(batch_count, num_train_steps))

      train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                            steps=num_train_steps)
      # The training records for this cycle are no longer needed.
      if train_record_dir:
        tf.gfile.DeleteRecursively(train_record_dir)

      tf.logging.info("Beginning evaluation.")
      if eval_input_fn is None:
        eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
            ncf_dataset=ncf_dataset, is_training=False)

        if eval_batch_count != num_eval_steps:
          raise ValueError(
              "Step counts do not match. ({} vs. {}) The async process is "
              "producing incorrect shards.".format(
                  eval_batch_count, num_eval_steps))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = eval_estimator.evaluate(eval_input_fn,
                                             steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")
    else:
      runner.train()
      tf.logging.info("Beginning evaluation.")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = runner.eval()
      tf.logging.info("Evaluation complete.")
    hr = float(eval_results[rconst.HR_KEY])
    ndcg = float(eval_results[rconst.NDCG_KEY])

    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_TARGET,
        value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                            value={"epoch": cycle_index, "value": hr})
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
        value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

    # Logged by the async process during record creation.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            deferred=True)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)
    # Log the HR and NDCG results.
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
            cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      target_reached = True
      break

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                          value={"success": target_reached})
  cleanup_fn()  # Cleanup data construction artifacts and subprocess.

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)
Exemplo n.º 17
0
def run_ncf(_):
    """Train and evaluate NCF in alternating cycles.

    Configuration is taken from the module-level `FLAGS` through
    `ncf_common.parse_flags`; user/item counts, step counts and the data
    producer come from `ncf_common.get_inputs`. Every cycle trains, evaluates,
    logs HR / NDCG / loss, and the loop ends early once
    `model_helpers.past_stop_threshold` reports that the hit rate passed
    `FLAGS.hr_threshold`. The producer is stopped and joined at the end.

    Args:
      _: Ignored.
    """
    config = ncf_common.parse_flags(FLAGS)

    (user_count, item_count, train_steps, eval_steps,
     data_producer) = ncf_common.get_inputs(config)

    config["num_users"] = user_count
    config["num_items"] = item_count
    data_producer.start()
    model_helpers.apply_clean(flags.FLAGS)

    ncf_estimator = construct_estimator(
        model_dir=FLAGS.model_dir, params=config)

    bench_logger, hooks = log_and_get_hooks(config["eval_batch_size"])
    cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals

    tags = mlperf_helper.TAGS
    hit_target = False
    mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
    for cycle in range(cycle_total):
        # More than one epoch per cycle is only allowed while the MLPerf
        # logger is disabled.
        assert not (FLAGS.epochs_between_evals != 1
                    and mlperf_helper.LOGGER.enabled)
        cycle_number = cycle + 1
        logging.info("Starting a training cycle: {}/{}".format(
            cycle_number, cycle_total))

        mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle)

        # Training phase.
        ncf_estimator.train(
            input_fn=data_producer.make_input_fn(is_training=True),
            hooks=hooks,
            steps=train_steps)

        # Evaluation phase.
        logging.info("Beginning evaluation.")
        eval_fn = data_producer.make_input_fn(is_training=False)

        mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle)
        metrics = ncf_estimator.evaluate(eval_fn, steps=eval_steps)
        logging.info("Evaluation complete.")

        hit_rate = float(metrics[rconst.HR_KEY])
        ndcg_score = float(metrics[rconst.NDCG_KEY])
        eval_loss = float(metrics["loss"])

        target_entry = {"epoch": cycle, "value": FLAGS.hr_threshold}
        mlperf_helper.ncf_print(key=tags.EVAL_TARGET, value=target_entry)

        accuracy_entry = {"epoch": cycle, "value": hit_rate}
        mlperf_helper.ncf_print(key=tags.EVAL_ACCURACY, value=accuracy_entry)

        num_neg_entry = {"epoch": cycle, "value": rconst.NUM_EVAL_NEGATIVES}
        mlperf_helper.ncf_print(key=tags.EVAL_HP_NUM_NEG, value=num_neg_entry)

        mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle)

        # Hand the raw metrics to the benchmark logger, then log a summary.
        bench_logger.log_evaluation_result(metrics)
        logging.info(
            "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
                cycle_number, hit_rate, ndcg_score, eval_loss))

        # Stop early once the hit-rate threshold has been passed.
        if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
            hit_target = True
            break

    mlperf_helper.ncf_print(key=tags.RUN_STOP, value={"success": hit_target})
    data_producer.stop_loop()
    data_producer.join()

    # Clear the session explicitly to avoid session delete error
    tf.keras.backend.clear_session()
    mlperf_helper.ncf_print(key=tags.RUN_FINAL)
Exemplo n.º 18
0
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator.

  Args:
    features: Mapping indexed with `movielens.USER_COLUMN` and
      `movielens.ITEM_COLUMN`, plus `rconst.DUPLICATE_MASK` in EVAL mode and
      `rconst.VALID_POINT_MASK` in TRAIN mode.
    labels: Training labels; cast to int32. Only read in TRAIN mode.
    mode: A `tf.estimator.ModeKeys` value. Only EVAL and TRAIN are supported.
    params: Dict of hyperparameters. This function reads "use_seed" (optional),
      "num_neg", "match_mlperf" and "use_xla_for_gpu" for evaluation, and
      "learning_rate", "beta1", "beta2", "epsilon" and "use_tpu" for training.
      The whole dict is also passed on to `construct_model`.

  Returns:
    In EVAL mode, whatever `compute_eval_loss_and_metrics` returns. In TRAIN
    mode, a `tf.estimator.EstimatorSpec` carrying the loss and the train op.

  Raises:
    NotImplementedError: If `mode` is neither EVAL nor TRAIN.
  """
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  logits = construct_model(users, items, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  # zeros_like (instead of tf.zeros(logits.shape, ...)) keeps this working
  # when the static shape of logits is not fully defined, e.g. an unknown
  # batch dimension.
  softmax_logits = tf.concat([tf.zeros_like(logits), logits], axis=1)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_xla_for_gpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                            value=params["learning_rate"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                            value=params["beta1"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                            value=params["beta2"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                            value=params["epsilon"])

    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)
    # The valid-point mask is applied as per-example loss weights.
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    # Group any ops registered under UPDATE_OPS with the optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError
Exemplo n.º 19
0
def run_ncf(_):
  """Train and evaluate NCF in alternating cycles.

  Configuration is read from the module-level `FLAGS`. The function optionally
  downloads the dataset, seeds NumPy, creates a data producer (a
  `data_pipeline.DummyConstructor` for synthetic data, otherwise the one
  returned by `data_preprocessing.instantiate_pipeline`), builds the estimator,
  and then trains and evaluates once per cycle. The loop ends early once
  `model_helpers.past_stop_threshold` reports that the hit rate passed
  `FLAGS.hr_threshold`. The producer is stopped and joined at the end.

  Args:
    _: Ignored.
  """
  if FLAGS.download_if_missing:
    if not FLAGS.use_synthetic_data:
      movielens.download(FLAGS.dataset, FLAGS.data_dir)

  seed = FLAGS.seed
  if seed is not None:
    np.random.seed(seed)

  config = parse_flags(FLAGS)
  cycle_total = FLAGS.train_epochs // FLAGS.epochs_between_evals

  if not FLAGS.use_synthetic_data:
    user_count, item_count, data_producer = (
        data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=config,
            constructor_type=FLAGS.constructor_type,
            deterministic=seed is not None))

    # One step consumes config["batches_per_step"] batches; both epoch sizes
    # must divide evenly.
    train_steps, train_leftover = divmod(
        data_producer.train_batches_per_epoch, config["batches_per_step"])
    eval_steps, eval_leftover = divmod(
        data_producer.eval_batches_per_epoch, config["batches_per_step"])
    assert not train_leftover
    assert not eval_leftover
  else:
    data_producer = data_pipeline.DummyConstructor()
    user_count, item_count = (
        data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[FLAGS.dataset])
    train_steps = eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  data_producer.start()

  config["num_users"] = user_count
  config["num_items"] = item_count
  model_helpers.apply_clean(flags.FLAGS)

  ncf_estimator = construct_estimator(model_dir=FLAGS.model_dir, params=config)

  bench_logger, hooks = log_and_get_hooks(config["eval_batch_size"])

  tags = mlperf_helper.TAGS
  hit_target = False
  mlperf_helper.ncf_print(key=tags.TRAIN_LOOP)
  for cycle in range(cycle_total):
    # More than one epoch per cycle is only allowed while the MLPerf logger
    # is disabled.
    assert not (FLAGS.epochs_between_evals != 1
                and mlperf_helper.LOGGER.enabled)
    cycle_number = cycle + 1
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_number, cycle_total))

    mlperf_helper.ncf_print(key=tags.TRAIN_EPOCH, value=cycle)

    # Training phase.
    ncf_estimator.train(
        input_fn=data_producer.make_input_fn(is_training=True),
        hooks=hooks,
        steps=train_steps)

    # Evaluation phase.
    tf.logging.info("Beginning evaluation.")
    eval_fn = data_producer.make_input_fn(is_training=False)

    mlperf_helper.ncf_print(key=tags.EVAL_START, value=cycle)
    metrics = ncf_estimator.evaluate(eval_fn, steps=eval_steps)
    tf.logging.info("Evaluation complete.")

    hit_rate = float(metrics[rconst.HR_KEY])
    ndcg_score = float(metrics[rconst.NDCG_KEY])
    eval_loss = float(metrics["loss"])

    target_entry = {"epoch": cycle, "value": FLAGS.hr_threshold}
    mlperf_helper.ncf_print(key=tags.EVAL_TARGET, value=target_entry)

    accuracy_entry = {"epoch": cycle, "value": hit_rate}
    mlperf_helper.ncf_print(key=tags.EVAL_ACCURACY, value=accuracy_entry)

    num_neg_entry = {"epoch": cycle, "value": rconst.NUM_EVAL_NEGATIVES}
    mlperf_helper.ncf_print(key=tags.EVAL_HP_NUM_NEG, value=num_neg_entry)

    mlperf_helper.ncf_print(key=tags.EVAL_STOP, value=cycle)

    # Hand the raw metrics to the benchmark logger, then log a summary.
    bench_logger.log_evaluation_result(metrics)
    tf.logging.info(
        "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
            cycle_number, hit_rate, ndcg_score, eval_loss))

    # Stop early once the hit-rate threshold has been passed.
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hit_rate):
      hit_target = True
      break

  mlperf_helper.ncf_print(key=tags.RUN_STOP, value={"success": hit_target})
  data_producer.stop_loop()
  data_producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()

  mlperf_helper.ncf_print(key=tags.RUN_FINAL)
Exemplo n.º 20
0
def neumf_model_fn(features, labels, mode, params):
    """Build the estimator spec for the NeuMF model.

    Args:
        features: Mapping indexed by movielens.USER_COLUMN and
            movielens.ITEM_COLUMN, plus rconst.DUPLICATE_MASK when evaluating
            and rconst.VALID_POINT_MASK when training.
        labels: Training labels; cast to int32 in TRAIN mode and unused in
            EVAL mode.
        mode: A tf.estimator.ModeKeys value; only EVAL and TRAIN are handled.
        params: Mapping of hyperparameters. Reads "use_seed" (optional),
            "num_neg", "match_mlperf" and "use_xla_for_gpu" for evaluation,
            and "learning_rate", "beta1", "beta2", "epsilon" and "use_tpu"
            for training.

    Returns:
        In EVAL mode, the result of _get_estimator_spec_with_metrics; in
        TRAIN mode, a tf.estimator.EstimatorSpec carrying the loss and the
        train op.

    Raises:
        NotImplementedError: If mode is neither EVAL nor TRAIN.
    """
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    user_ids = features[movielens.USER_COLUMN]
    item_ids = features[movielens.ITEM_COLUMN]

    keras_model = construct_model(
        tf.keras.layers.Input(tensor=user_ids),
        tf.keras.layers.Input(tensor=item_ids),
        params)
    logits = keras_model.output

    # Softmax with the first column of zeros is equivalent to sigmoid.
    softmax_logits = ncf_common.convert_to_softmax_logits(logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        dup_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return _get_estimator_spec_with_metrics(
            logits,
            softmax_logits,
            dup_mask,
            params["num_neg"],
            params["match_mlperf"],
            use_tpu_spec=params["use_xla_for_gpu"])

    # The model is built before this check, so unsupported modes still go
    # through graph construction above before failing.
    if mode != tf.estimator.ModeKeys.TRAIN:
        raise NotImplementedError

    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    # Report the optimizer configuration through the MLPerf logger.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    adam_tags = (
        (mlperf_helper.TAGS.OPT_LR, "learning_rate"),
        (mlperf_helper.TAGS.OPT_HP_ADAM_BETA1, "beta1"),
        (mlperf_helper.TAGS.OPT_HP_ADAM_BETA2, "beta2"),
        (mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON, "epsilon"),
    )
    for tag, param_name in adam_tags:
        mlperf_helper.ncf_print(key=tag, value=params[param_name])

    adam_kwargs = {
        name: params[name]
        for name in ("learning_rate", "beta1", "beta2", "epsilon")
    }
    optimizer = tf.compat.v1.train.AdamOptimizer(**adam_kwargs)
    if params["use_tpu"]:
        # TODO(seemuch): remove this contrib import
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)

    # The valid-point mask is applied as per-example loss weights.
    point_weights = tf.cast(valid_pt_mask, tf.float32)
    loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=softmax_logits, weights=point_weights)

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    step = tf.compat.v1.train.get_global_step()
    trainable = tf.compat.v1.trainable_variables()
    grads_and_vars = _sparse_to_dense_grads(
        optimizer.compute_gradients(
            loss, trainable, colocate_gradients_with_ops=True))
    apply_op = optimizer.apply_gradients(
        grads_and_vars, global_step=step, name="train")

    # Run everything registered under UPDATE_OPS together with the
    # gradient step.
    pending_updates = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(apply_op, pending_updates)

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=train_op)
Exemplo n.º 21
0
def run_ncf(_):
  """Run the NCF training and evaluation loop.

  Parses flags into params, starts the data producer, builds the estimator,
  and then alternates estimator.train and estimator.evaluate for
  FLAGS.train_epochs // FLAGS.epochs_between_evals cycles. The loop stops
  early once model_helpers.past_stop_threshold reports that the evaluated hit
  rate has passed FLAGS.hr_threshold.

  The producer is stopped and joined in a `finally` clause so that it is shut
  down even when setup, training or evaluation raises.

  Args:
    _: Unused positional argument (presumably the argv handed over by the app
      runner -- confirm against the caller).
  """
  params = ncf_common.parse_flags(FLAGS)

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()

  target_reached = False
  try:
    model_helpers.apply_clean(flags.FLAGS)

    estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)

    benchmark_logger, train_hooks = log_and_get_hooks(
        params["eval_batch_size"])
    total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
    for cycle_index in range(total_training_cycle):
      # With the MLPerf logger enabled, a cycle must be exactly one epoch.
      assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
      tf.logging.info("Starting a training cycle: {}/{}".format(
          cycle_index + 1, total_training_cycle))

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
                              value=cycle_index)

      train_input_fn = producer.make_input_fn(is_training=True)
      estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                      steps=num_train_steps)

      tf.logging.info("Beginning evaluation.")
      eval_input_fn = producer.make_input_fn(is_training=False)

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
                              value=cycle_index)
      eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
      tf.logging.info("Evaluation complete.")

      hr = float(eval_results[rconst.HR_KEY])
      ndcg = float(eval_results[rconst.NDCG_KEY])
      loss = float(eval_results["loss"])

      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_TARGET,
          value={"epoch": cycle_index, "value": FLAGS.hr_threshold})
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_ACCURACY,
                              value={"epoch": cycle_index, "value": hr})
      mlperf_helper.ncf_print(
          key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
          value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP,
                              value=cycle_index)

      # Benchmark the evaluation results
      benchmark_logger.log_evaluation_result(eval_results)
      # Log the HR and NDCG results.
      tf.logging.info(
          "Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
              cycle_index + 1, hr, ndcg, loss))

      # If some evaluation threshold is met
      if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
        target_reached = True
        break

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
                            value={"success": target_reached})
  finally:
    # Always shut the producer down, including when an exception is
    # propagating, so a started producer is never left running.
    producer.stop_loop()
    producer.join()

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
  mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_FINAL)