Example #1
def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
                             match_mlperf):
  # type: (pd.DataFrame, int, int, rconst.Paths, bool) -> None
  """Construct training and evaluation datasets.

  This function manages dataset construction and validates that the
  transformations have produced correct results. The particular logic of
  transforming the data is performed in _train_eval_map_fn().

  Args:
    df: The dataframe containing the entire dataset. It is essential that this
      dataframe be produced by _filter_index_sort(), as subsequent
      transformations rely on `df` having particular structure.
    approx_num_shards: The approximate number of similarly sized shards to
      construct from `df`. The MovieLens dataset has severe imbalances, where
      some users have interacted with many items; this is common among datasets
      involving user data. Rather than attempt to aggressively balance shard
      size, this function simply allows shards to "overflow", which can produce
      fewer shards than `approx_num_shards`. This small degree of imbalance
      does not impact performance; however, it does mean that one should not
      expect `approx_num_shards` to be the ACTUAL number of shards.
    num_items: The cardinality of the item set.
    cache_paths: rconst.Paths object containing locations for various cache
      files.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.
  """

  num_rows = len(df)
  approximate_partitions = np.linspace(
      0, num_rows, approx_num_shards + 1).astype("int")
  start_ind, end_ind = 0, 0
  shards = []

  for i in range(1, approx_num_shards + 1):
    end_ind = approximate_partitions[i]
    while (end_ind < num_rows and df[movielens.USER_COLUMN][end_ind - 1] ==
           df[movielens.USER_COLUMN][end_ind]):
      end_ind += 1

    if end_ind <= start_ind:
      continue  # The previous shard overflowed past this partition boundary.

    df_shard = df[start_ind:end_ind]
    user_shard = df_shard[movielens.USER_COLUMN].values.astype(np.int32)
    item_shard = df_shard[movielens.ITEM_COLUMN].values.astype(np.uint16)

    shards.append({
        movielens.USER_COLUMN: user_shard,
        movielens.ITEM_COLUMN: item_shard,
    })

    start_ind = end_ind
  assert end_ind == num_rows
  approx_num_shards = len(shards)

  tf.logging.info("Splitting train and test data and generating {} test "
                  "negatives per user...".format(rconst.NUM_EVAL_NEGATIVES))
  tf.gfile.MakeDirs(cache_paths.train_shard_subdir)

  map_args = [(shards[i], i, num_items, cache_paths)
              for i in range(approx_num_shards)]

  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
    pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member
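A note on the shard logic above: each approximate partition boundary from np.linspace is pushed forward until it lands on a user boundary, which is why the final shard count can be smaller than approx_num_shards. Below is a minimal, self-contained sketch of that idea using a hypothetical toy dataframe (the column names and data are illustrative, not the real MovieLens constants):

import numpy as np
import pandas as pd

# Toy dataframe sorted by user, mimicking the structure _filter_index_sort()
# is assumed to produce. Column names are placeholders.
df = pd.DataFrame({"user_id": [0, 0, 0, 1, 1, 2, 2, 2, 2, 3],
                   "item_id": [5, 3, 9, 1, 2, 7, 8, 4, 6, 0]})

approx_num_shards = 4
num_rows = len(df)
approximate_partitions = np.linspace(
    0, num_rows, approx_num_shards + 1).astype("int")

start_ind, end_ind = 0, 0
shards = []
for i in range(1, approx_num_shards + 1):
  end_ind = approximate_partitions[i]
  # Push the boundary forward until it no longer splits a user's rows.
  while (end_ind < num_rows and
         df["user_id"][end_ind - 1] == df["user_id"][end_ind]):
    end_ind += 1
  if end_ind <= start_ind:
    continue  # This partition was absorbed by the previous shard's overflow.
  shards.append(df[start_ind:end_ind])
  start_ind = end_ind

print([shard["user_id"].tolist() for shard in shards])
# e.g. [[0, 0, 0], [1, 1], [2, 2, 2, 2], [3]] -- no user straddles two shards.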
Example #2
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
    ):
  """Generate false negatives and write TFRecords files.

  Args:
    is_training: Whether training records (True) or eval records (False) are
      created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The pickled positive example shards from which to
      generate negatives.
    deterministic: If True, process shards with the order-preserving
      pool.imap instead of pool.imap_unordered.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG, value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  else:
    # Later logic assumes that all items for a given user are in the same batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES
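    # For instance, if rconst.NUM_EVAL_NEGATIVES were 999 (illustrative value),
    # each user would contribute 1000 consecutive points and batch_size would
    # have to be a multiple of 1000.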

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  assert epochs_per_cycle == 1 or is_training
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts
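  # For example (with purely illustrative numbers): num_pts=10 and batch_size=4
  # give num_pts_with_padding=12 and num_padding=2.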

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [stat_utils.random_int32()
                   for _ in training_shards * epochs_per_cycle]
  map_args = [
      (shard, num_items, num_neg, process_seeds[i], is_training, match_mlperf)
      for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    data = [
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding,), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a given
    # user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to fill out the final batch, randomly duplicate points from
      # elsewhere in the (already shuffled) data.

      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding,))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero-padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]

  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1

    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values at
    # the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
      batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file, because
  # writing directly to the final file can cause the main process to read a
  # partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, timeit.default_timer() - st))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(timeit.default_timer() - st))
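The num_readers docstring above warns that fewer output files may be produced than requested; that comes from the discretized batch-to-file assignment near the end of _construct_records(). A standalone sketch of just that assignment, with illustrative numbers (not taken from any real run):

import numpy as np

# Illustrative values only.
num_pts_with_padding = 12 * 100   # 12 full batches after padding
batch_size = 100
num_readers = 5

batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
current_file_id = -1
current_batch_id = -1
batches_by_file = [[] for _ in range(num_readers)]

while True:
  current_batch_id += 1
  if (current_batch_id % batches_per_file) == 0:
    current_file_id += 1
  start_ind = current_batch_id * batch_size
  end_ind = start_ind + batch_size
  if end_ind > num_pts_with_padding:
    if start_ind != num_pts_with_padding:
      raise ValueError("Batch padding does not line up")
    break
  batches_by_file[current_file_id].append(current_batch_id)

# Files that received no batches are dropped, so the effective reader count
# can be smaller than the requested num_readers.
batches_by_file = [i for i in batches_by_file if i]
print(batches_by_file)       # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]
print(len(batches_by_file))  # 4, even though 5 readers were requested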
Example #3
def _construct_records(
        is_training,  # type: bool
        train_cycle,  # type: typing.Optional[int]
        num_workers,  # type: int
        cache_paths,  # type: rconst.Paths
        num_readers,  # type: int
        num_neg,  # type: int
        num_positives,  # type: int
        num_items,  # type: int
        epochs_per_cycle,  # type: int
        batch_size,  # type: int
        training_shards,  # type: typing.List[str]
        deterministic=False,  # type: bool
        match_mlperf=False  # type: bool
):
    """Generate false negatives and write TFRecords files.

  Args:
    is_training: Are training records (True) or eval records (False) created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number is
      approximate; fewer shards will be created if not all shards are assigned
      batches. This can occur due to discretization in the assignment process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The picked positive examples from which to generate
      negatives.
  """
    st = timeit.default_timer()

    if is_training:
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG,
                                value=num_neg)

        # set inside _process_shard()
        mlperf_helper.ncf_print(
            key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT,
            value=True)

    else:
        # Later logic assumes that all items for a given user are in the same batch.
        assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
        assert num_neg == rconst.NUM_EVAL_NEGATIVES

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                                value=num_positives)

    assert epochs_per_cycle == 1 or is_training
    num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

    num_pts = num_positives * (1 + num_neg)

    # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
    # precision concerns
    num_pts_with_padding = (num_pts + batch_size -
                            1) // batch_size * batch_size
    num_padding = num_pts_with_padding - num_pts

    # We choose a different random seed for each process, so that the processes
    # will not all choose the same random numbers.
    process_seeds = [
        stat_utils.random_int32() for _ in training_shards * epochs_per_cycle
    ]
    map_args = [(shard, num_items, num_neg, process_seeds[i], is_training,
                 match_mlperf)
                for i, shard in enumerate(training_shards * epochs_per_cycle)]

    with popen_helper.get_pool(num_workers, init_worker) as pool:
        map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
        data_generator = map_fn(_process_shard, map_args)
        data = [
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.int32) - 1,
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.uint16),
            np.zeros(shape=(num_pts_with_padding, ), dtype=np.int8),
        ]

        # Training data is shuffled. Evaluation data MUST not be shuffled.
        # Downstream processing depends on the fact that evaluation data for a given
        # user is grouped within a batch.
        if is_training:
            index_destinations = np.random.permutation(num_pts)
            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
        else:
            index_destinations = np.arange(num_pts)

        start_ind = 0
        for data_segment in data_generator:
            n_in_segment = data_segment[0].shape[0]
            dest = index_destinations[start_ind:start_ind + n_in_segment]
            start_ind += n_in_segment
            for i in range(3):
                data[i][dest] = data_segment[i]

    assert np.sum(data[0] == -1) == num_padding

    if is_training:
        if num_padding:
            # In order to fill out the final batch, randomly duplicate points
            # from elsewhere in the (already shuffled) data.

            mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
            pad_sample_indices = np.random.randint(low=0,
                                                   high=num_pts,
                                                   size=(num_padding, ))
            dest = np.arange(start=start_ind, stop=start_ind + num_padding)
            start_ind += num_padding
            for i in range(3):
                data[i][dest] = data[i][pad_sample_indices]
    else:
        # For evaluation, padding is all zeros. The evaluation input_fn knows
        # how to interpret and discard the zero-padded entries.
        data[0][num_pts:] = 0

    # Check that no points were overlooked.
    assert not np.sum(data[0] == -1)

    if is_training:
        # The number of points is slightly larger than num_pts due to padding.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                                value=int(data[0].shape[0]))
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                                value=batch_size)
    else:
        # num_pts is logged instead of int(data[0].shape[0]), because the size
        # of the data vector includes zero pads which are ignored.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE,
                                value=num_pts)

    batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
    current_file_id = -1
    current_batch_id = -1
    batches_by_file = [[] for _ in range(num_readers)]

    while True:
        current_batch_id += 1
        if (current_batch_id % batches_per_file) == 0:
            current_file_id += 1

        start_ind = current_batch_id * batch_size
        end_ind = start_ind + batch_size
        if end_ind > num_pts_with_padding:
            if start_ind != num_pts_with_padding:
                raise ValueError("Batch padding does not line up")
            break
        batches_by_file[current_file_id].append(current_batch_id)

    # Drop shards which were not assigned batches
    batches_by_file = [i for i in batches_by_file if i]
    num_readers = len(batches_by_file)

    if is_training:
        # Empirically it is observed that placing the batch with repeated values at
        # the start rather than the end improves convergence.
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
        batches_by_file[0][0], batches_by_file[-1][-1] = \
          batches_by_file[-1][-1], batches_by_file[0][0]

    if is_training:
        template = rconst.TRAIN_RECORD_TEMPLATE
        record_dir = os.path.join(cache_paths.train_epoch_dir,
                                  get_cycle_folder_name(train_cycle))
        tf.gfile.MakeDirs(record_dir)
    else:
        template = rconst.EVAL_RECORD_TEMPLATE
        record_dir = cache_paths.eval_data_subdir

    batch_count = 0
    for i in range(num_readers):
        fpath = os.path.join(record_dir, template.format(i))
        log_msg("Writing {}".format(fpath))
        with tf.python_io.TFRecordWriter(fpath) as writer:
            for j in batches_by_file[i]:
                start_ind = j * batch_size
                end_ind = start_ind + batch_size
                record_kwargs = dict(
                    users=data[0][start_ind:end_ind],
                    items=data[1][start_ind:end_ind],
                )

                if is_training:
                    record_kwargs["labels"] = data[2][start_ind:end_ind]
                else:
                    record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
                        record_kwargs["items"].reshape(-1, num_neg + 1),
                        axis=1).flatten().astype(np.int8)

                batch_bytes = _construct_record(**record_kwargs)

                writer.write(batch_bytes)
                batch_count += 1

    # We write to a temp file then atomically rename it to the final file, because
    # writing directly to the final file can cause the main process to read a
    # partially written JSON file.
    ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
    with tf.gfile.Open(ready_file_temp, "w") as f:
        json.dump({
            "batch_size": batch_size,
            "batch_count": batch_count,
        }, f)
    ready_file = os.path.join(record_dir, rconst.READY_FILE)
    tf.gfile.Rename(ready_file_temp, ready_file)

    if is_training:
        log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
            train_cycle,
            timeit.default_timer() - st))
    else:
        log_msg(
            "Eval construction complete. Total time: {:.1f} seconds".format(
                timeit.default_timer() - st))
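In the eval branch, stat_utils.mask_duplicates flags repeated items within each user's group of num_neg + 1 candidates so duplicated negatives can be discounted downstream. The helper's actual implementation is not shown in these examples; the following is only a plausible standalone sketch of the same idea, under that assumption:

import numpy as np

def mask_duplicates_sketch(items_2d):
  # For each row (one user's positive plus its eval negatives), mark every
  # repeated item after its first occurrence with 1; first occurrences get 0.
  mask = np.zeros_like(items_2d, dtype=np.int8)
  for row_idx, row in enumerate(items_2d):
    seen = set()
    for col_idx, item in enumerate(row):
      if item in seen:
        mask[row_idx, col_idx] = 1
      seen.add(item)
  return mask

# One user with num_neg=3: item 7 appears twice, so its second occurrence
# is masked.
items = np.array([[2, 7, 7, 5]], dtype=np.uint16)
print(mask_duplicates_sketch(items))  # [[0 0 1 0]]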
Example #4
def _construct_training_records(
    train_cycle,  # type: int
    num_workers,  # type: int
    cache_paths,  # type: rconst.Paths
    num_readers,  # type: int
    num_neg,  # type: int
    num_train_positives,  # type: int
    num_items,  # type: int
    epochs_per_cycle,  # type: int
    train_batch_size,  # type: int
    training_shards,  # type: typing.List[str]
    spillover,  # type: bool
    carryover=None,  # type: typing.Union[typing.List[np.ndarray], None]
    deterministic=False  # type: bool
):
    """Generate false negatives and write TFRecords files.

  Args:
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the train input_fn.
    num_neg: The number of false negatives per positive example.
    num_train_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    train_batch_size: The expected batch size used during training. This is used
      to properly batch data when writing TFRecords.
    training_shards: The picked positive examples from which to generate
      negatives.
    spillover: If the final batch is incomplete, push it to the next
      cycle (True) or include a partial batch (False).
    carryover: The data points to be spilled over to the next cycle.
  """

    st = timeit.default_timer()
    num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])
    carryover = carryover or [
        np.zeros((0, ), dtype=np.int32),
        np.zeros((0, ), dtype=np.uint16),
        np.zeros((0, ), dtype=np.int8),
    ]
    num_carryover = carryover[0].shape[0]
    num_pts = num_carryover + num_train_positives * (1 + num_neg)

    # We choose a different random seed for each process, so that the processes
    # will not all choose the same random numbers.
    process_seeds = [
        np.random.randint(2**32) for _ in training_shards * epochs_per_cycle
    ]
    map_args = [(shard, num_items, num_neg, process_seeds[i])
                for i, shard in enumerate(training_shards * epochs_per_cycle)]

    with popen_helper.get_pool(num_workers, init_worker) as pool:
        map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
        data_generator = map_fn(_process_shard, map_args)
        data = [
            np.zeros(shape=(num_pts, ), dtype=np.int32) - 1,
            np.zeros(shape=(num_pts, ), dtype=np.uint16),
            np.zeros(shape=(num_pts, ), dtype=np.int8),
        ]

        # The carryover data is always first.
        for i in range(3):
            data[i][:num_carryover] = carryover[i]
        index_destinations = np.random.permutation(
            num_train_positives * (1 + num_neg)) + num_carryover
        start_ind = 0
        for data_segment in data_generator:
            n_in_segment = data_segment[0].shape[0]
            dest = index_destinations[start_ind:start_ind + n_in_segment]
            start_ind += n_in_segment
            for i in range(3):
                data[i][dest] = data_segment[i]

    # Check that no points were dropped.
    assert (num_pts - num_carryover) == start_ind
    assert not np.sum(data[0] == -1)

    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)

    batches_per_file = np.ceil(num_pts / train_batch_size / num_readers)
    current_file_id = -1
    current_batch_id = -1
    batches_by_file = [[] for _ in range(num_readers)]

    output_carryover = [
        np.zeros(shape=(0, ), dtype=np.int32),
        np.zeros(shape=(0, ), dtype=np.uint16),
        np.zeros(shape=(0, ), dtype=np.int8),
    ]

    while True:
        current_batch_id += 1
        if (current_batch_id % batches_per_file) == 0:
            current_file_id += 1
        end_ind = (current_batch_id + 1) * train_batch_size
        if end_ind > num_pts:
            if spillover:
                output_carryover = [
                    data[i][current_batch_id * train_batch_size:num_pts]
                    for i in range(3)
                ]
                break
            else:
                batches_by_file[current_file_id].append(current_batch_id)
                break
        batches_by_file[current_file_id].append(current_batch_id)

    batch_count = 0
    for i in range(num_readers):
        fpath = os.path.join(record_dir,
                             rconst.TRAIN_RECORD_TEMPLATE.format(i))
        log_msg("Writing {}".format(fpath))
        with tf.python_io.TFRecordWriter(fpath) as writer:
            for j in batches_by_file[i]:
                start_ind = j * train_batch_size
                end_ind = start_ind + train_batch_size
                batch_bytes = _construct_record(
                    users=data[0][start_ind:end_ind],
                    items=data[1][start_ind:end_ind],
                    labels=data[2][start_ind:end_ind],
                )

                writer.write(batch_bytes)
                batch_count += 1

    if spillover:
        written_pts = output_carryover[0].shape[
            0] + batch_count * train_batch_size
        if num_pts != written_pts:
            raise ValueError(
                "Error detected: point counts do not match: {} vs. {}".format(
                    num_pts, written_pts))

    # We write to a temp file then atomically rename it to the final file, because
    # writing directly to the final file can cause the main process to read a
    # partially written JSON file.
    ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
    with tf.gfile.Open(ready_file_temp, "w") as f:
        json.dump(
            {
                "batch_size": train_batch_size,
                "batch_count": batch_count,
            }, f)
    ready_file = os.path.join(record_dir, rconst.READY_FILE)
    tf.gfile.Rename(ready_file_temp, ready_file)

    log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
        train_cycle,
        timeit.default_timer() - st))

    return output_carryover
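The spillover path above returns the points that did not fill a complete batch, and the next cycle is expected to place them at the front of its data via the carryover argument. A minimal sketch of that bookkeeping with illustrative sizes (split_full_batches is a hypothetical helper, not part of the original code):

import numpy as np

batch_size = 4

def split_full_batches(points, carryover):
  # Prepend last cycle's leftover points, emit only full batches, and return
  # whatever does not fill a complete batch as the next cycle's carryover.
  data = np.concatenate([carryover, points])
  num_full = data.shape[0] // batch_size * batch_size
  return data[:num_full].reshape(-1, batch_size), data[num_full:]

carryover = np.zeros((0,), dtype=np.int32)
for cycle, cycle_points in enumerate([np.arange(6), np.arange(6, 13)]):
  batches, carryover = split_full_batches(cycle_points, carryover)
  print(cycle, batches.tolist(), carryover.tolist())
# 0 [[0, 1, 2, 3]] [4, 5]
# 1 [[4, 5, 6, 7], [8, 9, 10, 11]] [12]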