def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False):
  """Preprocess data and start negative generation subprocess."""

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  for _ in range(15):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset
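
The `_shutdown` helper registered with atexit above is not shown in this example. A minimal sketch of what such a helper might look like (an assumption, not the library's actual implementation; it relies on Python 3 subprocess timeouts):

import subprocess

def _shutdown_sketch(proc):
  # Hypothetical stand-in for _shutdown: ask the generation subprocess to
  # exit, then force-kill it if it does not stop in time.
  try:
    proc.terminate()
    proc.wait(timeout=10)
  except subprocess.TimeoutExpired:
    proc.kill()
    proc.wait()
  except OSError:
    pass  # The subprocess already exited.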
Example #2
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  user_input = tf.keras.layers.Input(tensor=users)
  item_input = tf.keras.layers.Input(tensor=items)
  logits = construct_model(user_input, item_input, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = ncf_common.convert_to_softmax_logits(logits)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return _get_estimator_spec_with_metrics(
        logits,
        softmax_logits,
        duplicate_mask,
        params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_tpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    optimizer = tf.compat.v1.train.AdamOptimizer(
        learning_rate=params["learning_rate"],
        beta1=params["beta1"],
        beta2=params["beta2"],
        epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

    loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    tf.identity(loss, name="cross_entropy")

    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError
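
A quick NumPy check of the "softmax with the first column of zeros" comment above: padding the logits with a zero column and taking a softmax reproduces the sigmoid (illustrative only).

import numpy as np

x = np.array([-2.0, 0.0, 3.0])                     # raw logits
two_col = np.stack([np.zeros_like(x), x], axis=1)  # [zeros, logits]
softmax = np.exp(two_col) / np.exp(two_col).sum(axis=1, keepdims=True)
sigmoid = 1.0 / (1.0 + np.exp(-x))
assert np.allclose(softmax[:, 1], sigmoid)  # second column equals sigmoid(x)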
Example #3
def neumf_model_fn(features, labels, mode, params):
    """Model Function for NeuMF estimator."""
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

    logits = construct_model(users=users, items=items, params=params)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            movielens.ITEM_COLUMN: items,
            movielens.RATING_COLUMN: logits,
        }

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    elif mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.cast(labels, tf.int32)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=params["learning_rate"],
            beta1=params["beta1"],
            beta2=params["beta2"],
            epsilon=params["epsilon"])
        if params["use_tpu"]:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Softmax with the first column of zeros is equivalent to sigmoid.
        logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype), logits],
                           axis=1)

        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)

        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        minimize_op = optimizer.apply_gradients(gradients,
                                                global_step=global_step,
                                                name="train")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    else:
        raise NotImplementedError
Example #4
    def _start_shuffle_iterator(self):
        if self._shuffle_with_forkpool:
            pool = popen_helper.get_forkpool(3, closing=False)
        else:
            pool = popen_helper.get_threadpool(1, closing=False)
        atexit.register(pool.close)
        args = [(self._elements_in_epoch, stat_utils.random_int32())
                for _ in range(self._maximum_number_epochs)]
        imap = pool.imap if self.deterministic else pool.imap_unordered
        self._shuffle_iterator = imap(stat_utils.permutation, args)
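
popen_helper.get_forkpool and popen_helper.get_threadpool are not defined in this example. A rough sketch of what they might wrap, assuming process- and thread-backed pools with a `closing` switch (names with a _sketch suffix are hypothetical):

import contextlib
import multiprocessing
import multiprocessing.dummy

def get_forkpool_sketch(num_workers, closing=True):
  # Process-backed pool. With closing=False the raw pool is returned so the
  # caller can manage shutdown itself, e.g. atexit.register(pool.close).
  pool = multiprocessing.Pool(processes=num_workers)
  return contextlib.closing(pool) if closing else pool

def get_threadpool_sketch(num_workers, closing=True):
  # Thread-backed pool with the same interface.
  pool = multiprocessing.dummy.Pool(processes=num_workers)
  return contextlib.closing(pool) if closing else pool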
Example #5
 def _start_shuffle_iterator(self):
   if self._shuffle_with_forkpool:
     pool = popen_helper.get_forkpool(3, closing=False)
   else:
     pool = popen_helper.get_threadpool(1, closing=False)
   atexit.register(pool.close)
   args = [(self._elements_in_epoch, stat_utils.random_int32())
           for _ in range(self._maximum_number_epochs)]
   imap = pool.imap if self.deterministic else pool.imap_unordered
   self._shuffle_iterator = imap(stat_utils.permutation, args)
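
stat_utils.random_int32 and stat_utils.permutation appear throughout these examples but are not shown. Plausible minimal versions, given how they are called above (assumptions, not the actual stat_utils module):

import numpy as np

def random_int32_sketch():
  # Draw a seed uniformly over the non-negative int32 range.
  return np.random.randint(low=0, high=np.iinfo(np.int32).max, dtype=np.int32)

def permutation_sketch(args):
  # Matches the (size, seed) tuples built in _start_shuffle_iterator above.
  size, seed = args
  return np.random.RandomState(seed=seed).permutation(size)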
Example #6
    def test_shard_randomness(self):
        users = [0, 0, 0, 0, 1, 1, 1, 1]
        items = [0, 2, 4, 6, 0, 2, 4, 6]
        times = [1, 2, 3, 4, 1, 2, 3, 4]
        df = pd.DataFrame({
            movielens.USER_COLUMN: users,
            movielens.ITEM_COLUMN: items,
            movielens.TIMESTAMP_COLUMN: times
        })
        cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
        np.random.seed(1)

        num_shards = 2
        num_items = 10
        data_preprocessing.generate_train_eval_data(
            df,
            approx_num_shards=num_shards,
            num_items=num_items,
            cache_paths=cache_paths,
            match_mlperf=True)

        raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
        assert len(raw_shards) == num_shards

        sharded_eval_data = []
        for i in range(2):
            sharded_eval_data.append(
                data_async_generation._process_shard(
                    (os.path.join(cache_paths.train_shard_subdir,
                                  raw_shards[i]),
                     num_items, rconst.NUM_EVAL_NEGATIVES,
                     stat_utils.random_int32(), False, True)))

        if sharded_eval_data[0][0][0] == 1:
            # Order is not assured for this part of the pipeline.
            sharded_eval_data.reverse()

        eval_data = [
            np.concatenate([shard[i] for shard in sharded_eval_data])
            for i in range(3)
        ]
        eval_data = {
            movielens.USER_COLUMN: eval_data[0],
            movielens.ITEM_COLUMN: eval_data[1],
        }

        eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
        self.assertAllClose(eval_data[movielens.USER_COLUMN],
                            [0] * eval_items_per_user +
                            [1] * eval_items_per_user)

        # Each shard process should generate different random items.
        self.assertNotAllClose(
            eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
            eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
Example #7
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False):
  """Preprocess data and start negative generation subprocess."""

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True",
      "--seed", str(int(stat_utils.random_int32()))
  ]

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  atexit.register(_shutdown, proc=proc)
  atexit.register(tf.gfile.DeleteRecursively,
                  ncf_dataset.cache_paths.cache_root)

  return ncf_dataset
Example #8
  def test_shard_randomness(self):
    users = [0, 0, 0, 0, 1, 1, 1, 1]
    items = [0, 2, 4, 6, 0, 2, 4, 6]
    times = [1, 2, 3, 4, 1, 2, 3, 4]
    df = pd.DataFrame({movielens.USER_COLUMN: users,
                       movielens.ITEM_COLUMN: items,
                       movielens.TIMESTAMP_COLUMN: times})
    cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
    np.random.seed(1)

    num_shards = 2
    num_items = 10
    data_preprocessing.generate_train_eval_data(
        df, approx_num_shards=num_shards, num_items=num_items,
        cache_paths=cache_paths, match_mlperf=True)

    raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
    assert len(raw_shards) == num_shards

    sharded_eval_data = []
    for i in range(2):
      sharded_eval_data.append(data_async_generation._process_shard(
          (os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
           num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
           False, True)))

    if sharded_eval_data[0][0][0] == 1:
      # Order is not assured for this part of the pipeline.
      sharded_eval_data.reverse()

    eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
                 for i in range(3)]
    eval_data = {
        movielens.USER_COLUMN: eval_data[0],
        movielens.ITEM_COLUMN: eval_data[1],
    }

    eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
    self.assertAllClose(eval_data[movielens.USER_COLUMN],
                        [0] * eval_items_per_user + [1] * eval_items_per_user)

    # Each shard process should generate different random items.
    self.assertNotAllClose(
        eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
        eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
Example #9
def write_flagfile(flags_, ncf_dataset):
  """Write flagfile to begin async data generation."""
  if ncf_dataset.deterministic:
    flags_["seed"] = stat_utils.random_int32()

  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the data generation
  # async process to read a partially written flagfile.
  flagfile_temp = os.path.join(ncf_dataset.cache_paths.cache_root,
                               rconst.FLAGFILE_TEMP)
  tf.logging.info("Preparing flagfile for async data generation in {} ..."
                  .format(flagfile_temp))
  with tf.gfile.Open(flagfile_temp, "w") as f:
    for k, v in six.iteritems(flags_):
      f.write("--{}={}\n".format(k, v))
  flagfile = os.path.join(ncf_dataset.cache_paths.cache_root, rconst.FLAGFILE)
  tf.gfile.Rename(flagfile_temp, flagfile)
  tf.logging.info(
      "Wrote flagfile for async data generation in {}.".format(flagfile))
Example #11
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  logits = construct_model(users, items, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
                              logits], axis=1)

  if mode == tf.estimator.ModeKeys.EVAL:
    duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
    return compute_eval_loss_and_metrics(
        logits, softmax_logits, duplicate_mask, params["num_neg"],
        params["match_mlperf"],
        use_tpu_spec=params["use_xla_for_gpu"])

  elif mode == tf.estimator.ModeKeys.TRAIN:
    labels = tf.cast(labels, tf.int32)
    valid_pt_mask = features[rconst.VALID_POINT_MASK]

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                            value=params["learning_rate"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                            value=params["beta1"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                            value=params["beta2"])
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                            value=params["epsilon"])

    optimizer = tf.train.AdamOptimizer(
        learning_rate=params["learning_rate"], beta1=params["beta1"],
        beta2=params["beta2"], epsilon=params["epsilon"])
    if params["use_tpu"]:
      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                            value=mlperf_helper.TAGS.BCE)
    loss = tf.losses.sparse_softmax_cross_entropy(
        labels=labels,
        logits=softmax_logits,
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    gradients = _sparse_to_dense_grads(gradients)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  else:
    raise NotImplementedError
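
_sparse_to_dense_grads is not included in this example. A minimal sketch under the assumption that it densifies IndexedSlices gradients (produced by the embedding lookups) before apply_gradients:

import tensorflow as tf

def sparse_to_dense_grads_sketch(grads_and_vars):
  # Convert tf.IndexedSlices gradients into dense tensors so that the Adam
  # update touches the full embedding variable rather than sparse rows.
  return [(tf.convert_to_tensor(grad) if isinstance(grad, tf.IndexedSlices)
           else grad, var)
          for grad, var in grads_and_vars]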
Example #12
def neumf_model_fn(features, labels, mode, params):
    """Model Function for NeuMF estimator."""
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

    logits = construct_model(users=users, items=items, params=params)

    # Softmax with the first column of zeros is equivalent to sigmoid.
    softmax_logits = tf.concat(
        [tf.zeros(logits.shape, dtype=logits.dtype), logits], axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            movielens.ITEM_COLUMN: items,
            movielens.RATING_COLUMN: logits,
        }

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    elif mode == tf.estimator.ModeKeys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return compute_eval_loss_and_metrics(logits,
                                             softmax_logits,
                                             duplicate_mask,
                                             params["num_neg"],
                                             params["match_mlperf"],
                                             use_tpu_spec=params["use_tpu"]
                                             or params["use_xla_for_gpu"])

    elif mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.cast(labels, tf.int32)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=params["learning_rate"],
            beta1=params["beta1"],
            beta2=params["beta2"],
            epsilon=params["epsilon"])
        if params["use_tpu"]:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=softmax_logits)

        # This tensor is used by logging hooks.
        tf.identity(loss, name="cross_entropy")

        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        gradients = _sparse_to_dense_grads(gradients)
        minimize_op = optimizer.apply_gradients(gradients,
                                                global_step=global_step,
                                                name="train")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    else:
        raise NotImplementedError
Example #13
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False, deterministic=False,
                         use_subprocess=True, cache_id=None):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Preprocess data and start negative generation subprocess."""

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf,
                                deterministic=deterministic,
                                cache_id=cache_id)
  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  flags_ = {
      "data_dir": data_dir,
      "cache_id": ncf_dataset.cache_paths.cache_id,
      "num_neg": num_neg,
      "num_train_positives": ncf_dataset.num_train_positives,
      "num_items": ncf_dataset.num_items,
      "num_users": ncf_dataset.num_users,
      "num_readers": ncf_dataset.num_data_readers,
      "epochs_per_cycle": epochs_per_cycle,
      "train_batch_size": batch_size,
      "eval_batch_size": eval_batch_size,
      "num_workers": num_workers,
      "redirect_logs": use_subprocess,
      "use_tf_logging": not use_subprocess,
      "ml_perf": match_mlperf,
  }

  if ncf_dataset.deterministic:
    flags_["seed"] = stat_utils.random_int32()
  tf.gfile.MakeDirs(data_dir)
  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the data generation
  # async process to read a partially written flagfile.
  flagfile_temp = os.path.join(ncf_dataset.cache_paths.cache_root,
                               rconst.FLAGFILE_TEMP)
  tf.logging.info("Preparing flagfile for async data generation in {} ..."
                  .format(flagfile_temp))
  with tf.gfile.Open(flagfile_temp, "w") as f:
    for k, v in six.iteritems(flags_):
      f.write("--{}={}\n".format(k, v))
  flagfile = os.path.join(ncf_dataset.cache_paths.cache_root, rconst.FLAGFILE)
  tf.gfile.Rename(flagfile_temp, flagfile)
  tf.logging.info(
      "Wrote flagfile for async data generation in {}."
      .format(flagfile))

  if use_subprocess:
    tf.logging.info("Creating training file subprocess.")
    subproc_env = os.environ.copy()
    # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
    # resources and by default will try to allocate GPU memory. This would cause
    # contention with the main training process.
    subproc_env["CUDA_VISIBLE_DEVICES"] = ""
    subproc_args = popen_helper.INVOCATION + [
        "--data_dir", data_dir,
        "--cache_id", str(ncf_dataset.cache_paths.cache_id)]
    tf.logging.info(
        "Generation subprocess command: {}".format(" ".join(subproc_args)))
    proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  cleanup_called = {"finished": False}
  @atexit.register
  def cleanup():
    """Remove files and subprocess from data generation."""
    if cleanup_called["finished"]:
      return

    if use_subprocess:
      _shutdown(proc)

    try:
      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
    except tf.errors.NotFoundError:
      pass

    cleanup_called["finished"] = True

  for _ in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset, cleanup
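
A sketch of how a caller might consume the (dataset, cleanup) pair returned above. The dataset name and batch sizes are placeholders, not values taken from this code:

ncf_dataset, cleanup_fn = instantiate_pipeline(
    dataset="ml-1m", data_dir="/tmp/movielens-data",
    batch_size=2048, eval_batch_size=4096,
    match_mlperf=False, deterministic=False, use_subprocess=True)
try:
  pass  # ... run training / evaluation cycles here ...
finally:
  # cleanup() is idempotent, so calling it here is safe even though it is
  # also registered with atexit.
  cleanup_fn()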
Example #14
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                         num_data_readers=None, num_neg=4, epochs_per_cycle=1,
                         match_mlperf=False, deterministic=False):
  # type: (...) -> (NCFDataset, typing.Callable)
  """Preprocess data and start negative generation subprocess."""

  tf.logging.info("Beginning data preprocessing.")
  ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                num_data_readers=num_data_readers,
                                match_mlperf=match_mlperf,
                                deterministic=deterministic)

  tf.logging.info("Creating training file subprocess.")

  subproc_env = os.environ.copy()

  # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
  # resources and by default will try to allocate GPU memory. This would cause
  # contention with the main training process.
  subproc_env["CUDA_VISIBLE_DEVICES"] = ""

  # By limiting the number of workers we guarantee that the worker
  # pool underlying the training generation doesn't starve other processes.
  num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

  subproc_args = popen_helper.INVOCATION + [
      "--data_dir", data_dir,
      "--cache_id", str(ncf_dataset.cache_paths.cache_id),
      "--num_neg", str(num_neg),
      "--num_train_positives", str(ncf_dataset.num_train_positives),
      "--num_items", str(ncf_dataset.num_items),
      "--num_readers", str(ncf_dataset.num_data_readers),
      "--epochs_per_cycle", str(epochs_per_cycle),
      "--train_batch_size", str(batch_size),
      "--eval_batch_size", str(eval_batch_size),
      "--num_workers", str(num_workers),
      "--spillover", "True",  # This allows the training input function to
                              # guarantee batch size and significantly improves
                              # performance. (~5% increase in examples/sec on
                              # GPU, and needed for TPU XLA.)
      "--redirect_logs", "True"
  ]
  if ncf_dataset.deterministic:
    subproc_args.extend(["--seed", str(int(stat_utils.random_int32()))])

  tf.logging.info(
      "Generation subprocess command: {}".format(" ".join(subproc_args)))

  proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)

  cleanup_called = {"finished": False}
  @atexit.register
  def cleanup():
    """Remove files and subprocess from data generation."""
    if cleanup_called["finished"]:
      return

    _shutdown(proc)
    try:
      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
    except tf.errors.NotFoundError:
      pass

    cleanup_called["finished"] = True

  for _ in range(300):
    if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
      break
    time.sleep(1)  # allow `alive` file to be written
  if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
    raise ValueError("Generation subprocess did not start correctly. Data will "
                     "not be available; exiting to avoid waiting forever.")

  return ncf_dataset, cleanup
Example #15
def neumf_model_fn(features, labels, mode, params):
  """Model Function for NeuMF estimator."""
  if params.get("use_seed"):
    tf.set_random_seed(stat_utils.random_int32())

  users = features[movielens.USER_COLUMN]
  items = features[movielens.ITEM_COLUMN]

  logits = construct_model(users, items, params).output

  # Softmax with the first column of zeros is equivalent to sigmoid.
  softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
                              logits], axis=1)

  if mode == tf.estimator.ModeKeys.EVAL:
      duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
      return compute_eval_loss_and_metrics(
          logits, softmax_logits, duplicate_mask, params["num_neg"],
          params["match_mlperf"],
          use_tpu_spec=params["use_xla_for_gpu"])

  if mode == tf.estimator.ModeKeys.TRAIN:
      labels = tf.cast(labels, tf.int32)
      valid_pt_mask = features[rconst.VALID_POINT_MASK]

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                              value=params["learning_rate"])
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                              value=params["beta1"])
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                              value=params["beta2"])
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                              value=params["epsilon"])

      optimizer = tf.train.AdamOptimizer(
          learning_rate=params["learning_rate"], beta1=params["beta1"],
          beta2=params["beta2"], epsilon=params["epsilon"])
      if params["use_tpu"]:
          optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                              value=mlperf_helper.TAGS.BCE)
      loss = tf.losses.sparse_softmax_cross_entropy(
          labels=labels,
          logits=softmax_logits,
          weights=tf.cast(valid_pt_mask, tf.float32)
      )

      # This tensor is used by logging hooks.
      tf.identity(loss, name="cross_entropy")

      global_step = tf.train.get_global_step()
      tvars = tf.trainable_variables()
      gradients = optimizer.compute_gradients(
          loss, tvars, colocate_gradients_with_ops=True)
      gradients = _sparse_to_dense_grads(gradients)
      minimize_op = optimizer.apply_gradients(
          gradients, global_step=global_step, name="train")
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      train_op = tf.group(minimize_op, update_ops)

      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
  raise NotImplementedError
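
The BCE tag above relies on the same two-column construction: sparse softmax cross-entropy over [0, x] equals binary cross-entropy on sigmoid(x). A small NumPy check (illustrative only):

import numpy as np

x = np.array([1.5, -0.5])             # raw logits for two examples
labels = np.array([1, 0])             # 1 = positive item, 0 = sampled negative
two_col = np.stack([np.zeros_like(x), x], axis=1)
log_probs = two_col - np.log(np.exp(two_col).sum(axis=1, keepdims=True))
softmax_xent = -log_probs[np.arange(2), labels]
sigmoid = 1.0 / (1.0 + np.exp(-x))
bce = -(labels * np.log(sigmoid) + (1 - labels) * np.log(1.0 - sigmoid))
assert np.allclose(softmax_xent, bce)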
Example #16
def neumf_model_fn(features, labels, mode, params):
    """Model Function for NeuMF estimator."""
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)

    keras_model = params.get("keras_model")
    if keras_model:
        logits = keras_model([users, items],
                             training=mode == tf.estimator.ModeKeys.TRAIN)
    else:
        keras_model = construct_model(users=users, items=items, params=params)
        logits = keras_model.output
    if not params["use_estimator"] and "keras_model" not in params:
        # When we are not using estimator, we need to reuse the Keras model when
        # this model_fn is called again, so that the variables are shared between
        # training and eval. So we mutate params to add the Keras model.
        params["keras_model"] = keras_model

    # Softmax with the first column of zeros is equivalent to sigmoid.
    softmax_logits = tf.concat(
        [tf.zeros(logits.shape, dtype=logits.dtype), logits], axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            movielens.ITEM_COLUMN: items,
            movielens.RATING_COLUMN: logits,
        }

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    elif mode == tf.estimator.ModeKeys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return compute_eval_loss_and_metrics(logits,
                                             softmax_logits,
                                             duplicate_mask,
                                             params["num_neg"],
                                             params["match_mlperf"],
                                             use_tpu_spec=params["use_tpu"]
                                             or params["use_xla_for_gpu"])

    elif mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.cast(labels, tf.int32)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                                value=params["learning_rate"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                                value=params["beta1"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                                value=params["beta2"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                                value=params["epsilon"])

        optimizer = tf.train.AdamOptimizer(
            learning_rate=params["learning_rate"],
            beta1=params["beta1"],
            beta2=params["beta2"],
            epsilon=params["epsilon"])
        if params["use_tpu"]:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                                value=mlperf_helper.TAGS.BCE)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=softmax_logits)

        # This tensor is used by logging hooks.
        tf.identity(loss, name="cross_entropy")

        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        gradients = _sparse_to_dense_grads(gradients)
        minimize_op = optimizer.apply_gradients(gradients,
                                                global_step=global_step,
                                                name="train")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        if params["use_tpu"]:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=loss,
                                                   train_op=train_op)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    else:
        raise NotImplementedError
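
A minimal illustration of the model-caching pattern described in the comment above, where the Keras model is stored in params so that a later call shares variables. The toy layers and shapes are hypothetical; the real construct_model builds the NeuMF towers:

import tensorflow as tf

def get_or_build_model_sketch(params):
    # Build the Keras model once, stash it in params, and reuse it on later
    # calls so training and eval operate on the same variables.
    if "keras_model" not in params:
        inputs = tf.keras.layers.Input(shape=(4,))
        outputs = tf.keras.layers.Dense(1)(inputs)
        params["keras_model"] = tf.keras.Model(inputs=inputs, outputs=outputs)
    return params["keras_model"]

params = {}
assert get_or_build_model_sketch(params) is get_or_build_model_sketch(params)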
Example #17
def _construct_records(
        is_training,  # type: bool
        train_cycle,  # type: typing.Optional[int]
        num_workers,  # type: int
        cache_paths,  # type: rconst.Paths
        num_readers,  # type: int
        num_neg,  # type: int
        num_positives,  # type: int
        num_items,  # type: int
        epochs_per_cycle,  # type: int
        batch_size,  # type: int
        training_shards,  # type: typing.List[str]
        deterministic=False,  # type: bool
        match_mlperf=False  # type: bool
):
    """Generate false negatives and write TFRecords files.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The pickled positive examples from which to generate
      negatives.
  """
    st = timeit.default_timer()

    if is_training:
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG,
                                value=num_neg)

        # set inside _process_shard()
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
            value=True)

    else:
        # Later logic assumes that all items for a given user are in the same batch.
        assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
        assert num_neg == rconst.NUM_EVAL_NEGATIVES

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                value=num_positives)

    assert epochs_per_cycle == 1 or is_training
    num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

    num_pts = num_positives * (1 + num_neg)

    # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
    # precision concerns
    num_pts_with_padding = (num_pts + batch_size -
                            1) // batch_size * batch_size
    num_padding = num_pts_with_padding - num_pts

    # We choose a different random seed for each process, so that the processes
    # will not all choose the same random numbers.
    process_seeds = [
        stat_utils.random_int32() for _ in training_shards * epochs_per_cycle
    ]
    map_args = [(shard, num_items, num_neg, process_seeds[i], is_training,
                 match_mlperf)
                for i, shard in enumerate(training_shards * epochs_per_cycle)]

    with popen_helper.get_pool(num_workers, init_worker) as pool:
        map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
        data_generator = map_fn(_process_shard, map_args)
        data = [
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.int32) - 1,
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.uint16),
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.int8),
        ]

        # Training data is shuffled. Evaluation data MUST not be shuffled.
        # Downstream processing depends on the fact that evaluation data for a given
        # user is grouped within a batch.
        if is_training:
            index_destinations = np.random.permutation(num_pts)
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
        else:
            index_destinations = np.arange(num_pts)

        start_ind = 0
        for data_segment in data_generator:
            n_in_segment = data_segment[0].shape[0]
            dest = index_destinations[start_ind:start_ind + n_in_segment]
            start_ind += n_in_segment
            for i in range(3):
                data[i][dest] = data_segment[i]

    assert np.sum(data[0] == -1) == num_padding

    if is_training:
        if num_padding:
            # In order to have a full batch, randomly include points from earlier in
            # the batch.

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
            pad_sample_indices = np.random.randint(low=0,
                                                   high=num_pts,
                                                   size=(num_padding, ))
            dest = np.arange(start=start_ind, stop=start_ind + num_padding)
            start_ind += num_padding
            for i in range(3):
                data[i][dest] = data[i][pad_sample_indices]
    else:
        # For Evaluation, padding is all zeros. The evaluation input_fn knows how
        # to interpret and discard the zero padded entries.
        data[0][num_pts:] = 0

    # Check that no points were overlooked.
    assert not np.sum(data[0] == -1)

    if is_training:
        # The number of points is slightly larger than num_pts due to padding.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                                value=int(data[0].shape[0]))
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                                value=batch_size)
    else:
        # num_pts is logged instead of int(data[0].shape[0]), because the size
        # of the data vector includes zero pads which are ignored.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE,
                                value=num_pts)

    batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
    current_file_id = -1
    current_batch_id = -1
    batches_by_file = [[] for _ in range(num_readers)]

    while True:
        current_batch_id += 1
        if (current_batch_id % batches_per_file) == 0:
            current_file_id += 1

        start_ind = current_batch_id * batch_size
        end_ind = start_ind + batch_size
        if end_ind > num_pts_with_padding:
            if start_ind != num_pts_with_padding:
                raise ValueError("Batch padding does not line up")
            break
        batches_by_file[current_file_id].append(current_batch_id)

    # Drop shards which were not assigned batches
    batches_by_file = [i for i in batches_by_file if i]
    num_readers = len(batches_by_file)

    if is_training:
        # Empirically it is observed that placing the batch with repeated values at
        # the start rather than the end improves convergence.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
        batches_by_file[0][0], batches_by_file[-1][-1] = \
          batches_by_file[-1][-1], batches_by_file[0][0]

    if is_training:
        template = rconst.TRAIN_RECORD_TEMPLATE
        record_dir = os.path.join(cache_paths.train_epoch_dir,
                                  get_cycle_folder_name(train_cycle))
        tf.gfile.MakeDirs(record_dir)
    else:
        template = rconst.EVAL_RECORD_TEMPLATE
        record_dir = cache_paths.eval_data_subdir

    batch_count = 0
    for i in range(num_readers):
        fpath = os.path.join(record_dir, template.format(i))
        log_msg("Writing {}".format(fpath))
        with tf.python_io.TFRecordWriter(fpath) as writer:
            for j in batches_by_file[i]:
                start_ind = j * batch_size
                end_ind = start_ind + batch_size
                record_kwargs = dict(
                    users=data[0][start_ind:end_ind],
                    items=data[1][start_ind:end_ind],
                )

                if is_training:
                    record_kwargs["labels"] = data[2][start_ind:end_ind]
                else:
                    record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
                        record_kwargs["items"].reshape(-1, num_neg + 1),
                        axis=1).flatten().astype(np.int8)

                batch_bytes = _construct_record(**record_kwargs)

                writer.write(batch_bytes)
                batch_count += 1

    # We write to a temp file then atomically rename it to the final file, because
    # writing directly to the final file can cause the main process to read a
    # partially written JSON file.
    ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
    with tf.gfile.Open(ready_file_temp, "w") as f:
        json.dump({
            "batch_size": batch_size,
            "batch_count": batch_count,
        }, f)
    ready_file = os.path.join(record_dir, rconst.READY_FILE)
    tf.gfile.Rename(ready_file_temp, ready_file)

    if is_training:
        log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
            train_cycle,
            timeit.default_timer() - st))
    else:
        log_msg(
            "Eval construction complete. Total time: {:.1f} seconds".format(
                timeit.default_timer() - st))
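
stat_utils.mask_duplicates (used to build the eval dupe_mask above) is not shown. A plausible NumPy stand-in, assuming it flags repeat occurrences of an item within each row (the 2-D, axis=1 case used here):

import numpy as np

def mask_duplicates_sketch(items, axis=1):
    # First occurrence of each item in a row stays 0; repeats are marked 1.
    # Only the 2-D, axis=1 case needed above is handled in this sketch.
    assert axis == 1 and items.ndim == 2
    mask = np.zeros_like(items, dtype=np.int8)
    for row in range(items.shape[0]):
        seen = set()
        for col in range(items.shape[1]):
            value = int(items[row, col])
            mask[row, col] = 1 if value in seen else 0
            seen.add(value)
    return mask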
Example #18
def instantiate_pipeline(dataset,
                         data_dir,
                         batch_size,
                         eval_batch_size,
                         num_data_readers=None,
                         num_neg=4,
                         epochs_per_cycle=1):
    """Preprocess data and start negative generation subprocess."""

    movielens.download(dataset=dataset, data_dir=data_dir)
    tf.logging.info("Beginning data preprocessing.")
    ncf_dataset = construct_cache(dataset=dataset,
                                  data_dir=data_dir,
                                  num_data_readers=num_data_readers)

    tf.logging.info("Creating training file subprocess.")

    subproc_env = os.environ.copy()

    # The subprocess uses TensorFlow for tf.gfile, but it does not need GPU
    # resources and by default will try to allocate GPU memory. This would cause
    # contention with the main training process.
    subproc_env["CUDA_VISIBLE_DEVICES"] = ""

    python = "python3" if six.PY3 else "python2"

    # By limiting the number of workers we guarantee that the worker
    # pool underlying the training generation doesn't starve other processes.
    num_workers = int(multiprocessing.cpu_count() * 0.75) or 1

    subproc_args = [
        python,
        _ASYNC_GEN_PATH,
        "--data_dir",
        data_dir,
        "--cache_id",
        str(ncf_dataset.cache_paths.cache_id),
        "--num_neg",
        str(num_neg),
        "--num_train_positives",
        str(ncf_dataset.num_train_positives),
        "--num_items",
        str(ncf_dataset.num_items),
        "--num_readers",
        str(ncf_dataset.num_data_readers),
        "--epochs_per_cycle",
        str(epochs_per_cycle),
        "--train_batch_size",
        str(batch_size),
        "--eval_batch_size",
        str(eval_batch_size),
        "--num_workers",
        str(num_workers),
        "--spillover",
        "True",  # This allows the training input function to
        # guarantee batch size and significantly improves
        # performance. (~5% increase in examples/sec on
        # GPU, and needed for TPU XLA.)
        "--redirect_logs",
        "True",
        "--seed",
        str(int(stat_utils.random_int32()))
    ]

    tf.logging.info("Generation subprocess command: {}".format(
        " ".join(subproc_args)))

    proc = subprocess.Popen(args=subproc_args,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            shell=False,
                            env=subproc_env)

    atexit.register(_shutdown, proc=proc)
    atexit.register(tf.gfile.DeleteRecursively,
                    ncf_dataset.cache_paths.cache_root)

    return ncf_dataset
Example #19
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Generate false negatives and write TFRecords files.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The pickled positive examples from which to generate
      negatives.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG, value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  else:
    # Later logic assumes that all items for a given user are in the same batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  assert epochs_per_cycle == 1 or is_training
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [stat_utils.random_int32()
                   for _ in training_shards * epochs_per_cycle]
  map_args = [
      (shard, num_items, num_neg, process_seeds[i], is_training, match_mlperf)
      for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    data = [
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding,), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a given
    # user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to have a full batch, randomly include points from earlier in
      # the batch.

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding,))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For Evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]

  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1

    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values at
    # the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
      batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file, because
  # writing directly to the final file can cause the main process to read a
  # partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, timeit.default_timer() - st))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(timeit.default_timer() - st))
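
The ceil-division comment above ("equivalent to int(ceil(num_pts / batch_size)) * batch_size") in concrete numbers, as a worked example:

num_pts, batch_size = 10, 4
num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
assert num_pts_with_padding == 12           # ceil(10 / 4) * 4
assert num_pts_with_padding - num_pts == 2  # two padded points to fill a batch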