Example #1
def _process_shard(args):
    # type: ((str, int, int, int)) -> (np.ndarray, np.ndarray, np.ndarray)
    """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
  """
    shard_path, num_items, num_neg, seed = args
    np.random.seed(seed)

    # The choice to store the training shards in files rather than in memory
    # is motivated by the fact that multiprocessing serializes arguments,
    # transmits them to map workers, and then deserializes them. By storing the
    # training shards in files, the serialization work only needs to be done once.
    #
    # A similar effect could be achieved by simply holding pickled bytes in
    # memory; however, the processing is not I/O bound, so avoiding the file
    # reads is unnecessary.
    with tf.gfile.Open(shard_path, "rb") as f:
        shard = pickle.load(f)

    users = shard[movielens.USER_COLUMN]
    items = shard[movielens.ITEM_COLUMN]

    delta = users[1:] - users[:-1]
    boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                  [users.shape[0]])

    user_blocks = []
    item_blocks = []
    label_blocks = []
    for i in range(len(boundaries) - 1):
        assert len(set(users[boundaries[i]:boundaries[i + 1]])) == 1
        positive_items = items[boundaries[i]:boundaries[i + 1]]
        positive_set = set(positive_items)
        if positive_items.shape[0] != len(positive_set):
            raise ValueError("Duplicate entries detected.")
        n_pos = len(positive_set)

        negatives = stat_utils.sample_with_exclusion(num_items, positive_set,
                                                     n_pos * num_neg)

        user_blocks.append(users[boundaries[i]] * np.ones(
            (n_pos * (1 + num_neg), ), dtype=np.int32))
        item_blocks.append(
            np.array(list(positive_set) + negatives, dtype=np.uint16))
        labels_for_user = np.zeros((n_pos * (1 + num_neg), ), dtype=np.int8)
        labels_for_user[:n_pos] = 1
        label_blocks.append(labels_for_user)

    users_out = np.concatenate(user_blocks)
    items_out = np.concatenate(item_blocks)
    labels_out = np.concatenate(label_blocks)

    assert users_out.shape == items_out.shape == labels_out.shape
    return users_out, items_out, labels_out
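
The delta/argwhere lines above derive per-user slice boundaries from a user-sorted array. A minimal, self-contained sketch of that idiom (using made-up data, independent of any shard file) shows how the boundaries partition the array into contiguous single-user blocks:

import numpy as np

users = np.array([3, 3, 3, 7, 7, 9], dtype=np.int32)  # already sorted by user id
delta = users[1:] - users[:-1]                         # nonzero wherever the user changes
boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() + [users.shape[0]])
# boundaries == [0, 3, 5, 6]
for i in range(len(boundaries) - 1):
    block = users[boundaries[i]:boundaries[i + 1]]
    assert len(set(block)) == 1  # every block contains exactly one user id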
Example #2
def _process_shard(args):
  # type: ((str, int, int, int)) -> (np.ndarray, np.ndarray, np.ndarray)
  """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
  """
  shard_path, num_items, num_neg, seed = args
  np.random.seed(seed)

  # The choice to store the training shards in files rather than in memory
  # is motivated by the fact that multiprocessing serializes arguments,
  # transmits them to map workers, and then deserializes them. By storing the
  # training shards in files, the serialization work only needs to be done once.
  #
  # A similar effect could be achieved by simply holding pickled bytes in
  # memory; however, the processing is not I/O bound, so avoiding the file
  # reads is unnecessary.
  with tf.gfile.Open(shard_path, "rb") as f:
    shard = pickle.load(f)

  users = shard[movielens.USER_COLUMN]
  items = shard[movielens.ITEM_COLUMN]

  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  user_blocks = []
  item_blocks = []
  label_blocks = []
  for i in range(len(boundaries) - 1):
    assert len(set(users[boundaries[i]:boundaries[i+1]])) == 1
    positive_items = items[boundaries[i]:boundaries[i+1]]
    positive_set = set(positive_items)
    if positive_items.shape[0] != len(positive_set):
      raise ValueError("Duplicate entries detected.")
    n_pos = len(positive_set)

    negatives = stat_utils.sample_with_exclusion(
        num_items, positive_set, n_pos * num_neg)

    user_blocks.append(users[boundaries[i]] * np.ones(
        (n_pos * (1 + num_neg),), dtype=np.int32))
    item_blocks.append(
        np.array(list(positive_set) + negatives, dtype=np.uint16))
    labels_for_user = np.zeros((n_pos * (1 + num_neg),), dtype=np.int8)
    labels_for_user[:n_pos] = 1
    label_blocks.append(labels_for_user)

  users_out = np.concatenate(user_blocks)
  items_out = np.concatenate(item_blocks)
  labels_out = np.concatenate(label_blocks)

  assert users_out.shape == items_out.shape == labels_out.shape
  return users_out, items_out, labels_out
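
Since _process_shard takes a single packed argument tuple, it maps cleanly onto multiprocessing.Pool.map. The driver below is purely illustrative; the pool size, shard paths, and per-shard seed scheme are assumptions and not part of this listing:

import multiprocessing

import numpy as np

def generate_training_data(shard_paths, num_items, num_neg, base_seed):
    # Pack one argument tuple per shard; each worker gets a distinct seed.
    map_args = [(path, num_items, num_neg, base_seed + i)
                for i, path in enumerate(shard_paths)]
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(_process_shard, map_args)
    users = np.concatenate([r[0] for r in results])
    items = np.concatenate([r[1] for r in results])
    labels = np.concatenate([r[2] for r in results])
    return users, items, labels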
Example #3
def _train_eval_map_fn(args):
    # type: (...) -> typing.Dict[str, np.ndarray]
    """Split training and testing data and generate testing negatives.

  This function is called as part of a multiprocessing map. The principal
  input is a shard, which contains a sorted array of users and corresponding
  items for each user, where items have already been sorted in ascending order
  by timestamp. (Timestamp is not passed to avoid the serialization cost of
  sending it to the map function.)

  For each user, all but the last item are written into a pickle file, which the
  training data producer can consume as needed. The last item for a user
  is a validation point; for each validation point a number of negatives are
  generated (typically 999). The validation data is returned by this function,
  as it is held in memory for the remainder of the run.

  Args:
    shard: A dict containing the user and item arrays.
    shard_id: The id of the shard provided. This is used to number the training
      shard pickle files.
    num_items: The cardinality of the item set, which determines the set from
      which validation negatives should be drawn.
    cache_paths: rconst.Paths object containing locations for various cache
      files.
    seed: Random seed to be used when generating testing negatives.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.

  Returns:
    A dict containing the evaluation data for a given shard.
  """

    shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
    np.random.seed(seed)

    users = shard[movielens.USER_COLUMN]
    items = shard[movielens.ITEM_COLUMN]

    # This produces index boundaries which can be used to slice by user.
    delta = users[1:] - users[:-1]
    boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                  [users.shape[0]])

    train_blocks = []
    test_blocks = []
    test_positives = []
    for i in range(len(boundaries) - 1):
        # This is simply a vector of repeated values such that the shard could be
        # represented compactly with a tuple of tuples:
        #   ((user_id, items), (user_id, items), ...)
        # rather than:
        #   user_id_vector, item_id_vector
        # However the additional nested structure significantly increases the
        # serialization and deserialization cost such that it is not worthwhile.
        block_user = users[boundaries[i]:boundaries[i + 1]]
        assert len(set(block_user)) == 1

        block_items = items[boundaries[i]:boundaries[i + 1]]
        train_blocks.append((block_user[:-1], block_items[:-1]))

        test_negatives = stat_utils.sample_with_exclusion(
            num_items=num_items,
            positive_set=set(block_items),
            n=rconst.NUM_EVAL_NEGATIVES,
            replacement=match_mlperf)
        test_blocks.append((block_user[0] * np.ones(
            (rconst.NUM_EVAL_NEGATIVES + 1, ), dtype=np.int32),
                            np.array([block_items[-1]] + test_negatives,
                                     dtype=np.uint16)))
        test_positives.append((block_user[0], block_items[-1]))

    train_users = np.concatenate([i[0] for i in train_blocks])
    train_items = np.concatenate([i[1] for i in train_blocks])

    train_shard_fpath = cache_paths.train_shard_template.format(
        str(shard_id).zfill(5))

    with tf.gfile.Open(train_shard_fpath, "wb") as f:
        pickle.dump(
            {
                movielens.USER_COLUMN: train_users,
                movielens.ITEM_COLUMN: train_items,
            }, f)

    test_users = np.concatenate([i[0] for i in test_blocks])
    test_items = np.concatenate([i[1] for i in test_blocks])
    assert test_users.shape == test_items.shape
    assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0

    return {
        movielens.USER_COLUMN: test_users,
        movielens.ITEM_COLUMN: test_items,
    }
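
stat_utils.sample_with_exclusion itself is not shown in this listing. A plausible sketch of its assumed contract (draw n item ids from range(num_items) that avoid positive_set, optionally without replacement; the default for replacement is also an assumption) could look like:

import numpy as np

def sample_with_exclusion(num_items, positive_set, n, replacement=True):
    """Illustrative stand-in for stat_utils.sample_with_exclusion."""
    exclude = set(positive_set)
    negatives = []
    while len(negatives) < n:
        # Over-sample, then discard anything that collides with the exclusion set.
        candidates = np.random.randint(0, num_items, size=2 * (n - len(negatives)))
        for candidate in candidates:
            if candidate in exclude:
                continue
            negatives.append(int(candidate))
            if not replacement:
                exclude.add(int(candidate))  # previously drawn negatives are excluded too
            if len(negatives) == n:
                break
    return negatives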
Example #4
def _process_shard(args):
    # type: ((str, int, int, int, bool)) -> (np.ndarray, np.ndarray, np.ndarray)
    """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
    is_training: Generate training (True) or eval (False) data.
    match_mlperf: If True, match the MLPerf reference behavior.
  """
    shard_path, num_items, num_neg, seed, is_training, match_mlperf = args
    np.random.seed(seed)

    # The choice to store the training shards in files rather than in memory
    # is motivated by the fact that multiprocessing serializes arguments,
    # transmits them to map workers, and then deserializes them. By storing the
    # training shards in files, the serialization work only needs to be done once.
    #
    # A similar effect could be achieved by simply holding pickled bytes in
    # memory; however, the processing is not I/O bound, so avoiding the file
    # reads is unnecessary.
    with tf.gfile.Open(shard_path, "rb") as f:
        shard = pickle.load(f)

    users = shard[rconst.TRAIN_KEY][movielens.USER_COLUMN]
    items = shard[rconst.TRAIN_KEY][movielens.ITEM_COLUMN]

    if not is_training:
        # For eval, there is one positive which was held out from the training set.
        test_positive_dict = dict(
            zip(shard[rconst.EVAL_KEY][movielens.USER_COLUMN],
                shard[rconst.EVAL_KEY][movielens.ITEM_COLUMN]))

    delta = users[1:] - users[:-1]
    boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                  [users.shape[0]])

    user_blocks = []
    item_blocks = []
    label_blocks = []
    for i in range(len(boundaries) - 1):
        assert len(set(users[boundaries[i]:boundaries[i + 1]])) == 1
        current_user = users[boundaries[i]]

        positive_items = items[boundaries[i]:boundaries[i + 1]]
        positive_set = set(positive_items)
        if positive_items.shape[0] != len(positive_set):
            raise ValueError("Duplicate entries detected.")

        if is_training:
            n_pos = len(positive_set)
            negatives = stat_utils.sample_with_exclusion(num_items,
                                                         positive_set,
                                                         n_pos * num_neg,
                                                         replacement=True)

        else:
            if not match_mlperf:
                # The MLPerf reference allows the holdout item to appear as a negative.
                # Including it in the positive set makes the eval more stringent, because
                # an appearance of the test item would be removed by deduplication rules
                # (effectively resulting in a minute reduction of NUM_EVAL_NEGATIVES).
                positive_set.add(test_positive_dict[current_user])

            negatives = stat_utils.sample_with_exclusion(
                num_items, positive_set, num_neg, replacement=match_mlperf)
            positive_set = [test_positive_dict[current_user]]
            n_pos = len(positive_set)
            assert n_pos == 1

        user_blocks.append(current_user * np.ones(
            (n_pos * (1 + num_neg), ), dtype=np.int32))
        item_blocks.append(
            np.array(list(positive_set) + negatives, dtype=np.uint16))
        labels_for_user = np.zeros((n_pos * (1 + num_neg), ), dtype=np.int8)
        labels_for_user[:n_pos] = 1
        label_blocks.append(labels_for_user)

    users_out = np.concatenate(user_blocks)
    items_out = np.concatenate(item_blocks)
    labels_out = np.concatenate(label_blocks)

    assert users_out.shape == items_out.shape == labels_out.shape
    return users_out, items_out, labels_out
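
For the eval branch above, each user's block is laid out as one positive followed by num_neg negatives, with labels [1, 0, ..., 0]. The toy snippet below (hypothetical scores, num_neg = 4) illustrates how such a block could be scored for hit@K; it is not part of the original pipeline:

import numpy as np

# One eval block for one user: [positive_item, neg_1, ..., neg_4].
items_for_user = np.array([42, 7, 13, 99, 5], dtype=np.uint16)
labels_for_user = np.array([1, 0, 0, 0, 0], dtype=np.int8)

# Hypothetical model scores for the five candidates.
scores = np.array([0.8, 0.3, 0.9, 0.1, 0.2])
k = 2
top_k = np.argsort(-scores)[:k]           # indices of the k highest-scoring candidates
hit = bool(labels_for_user[top_k].any())  # True here: the positive ranks second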
Example #5
def _process_shard(args):
  # type: ((str, int, int, int, bool)) -> (np.ndarray, np.ndarray, np.ndarray)
  """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
    is_training: Generate training (True) or eval (False) data.
    match_mlperf: If True, match the MLPerf reference behavior.
  """
  shard_path, num_items, num_neg, seed, is_training, match_mlperf = args
  np.random.seed(seed)

  # The choice to store the training shards in files rather than in memory
  # is motivated by the fact that multiprocessing serializes arguments,
  # transmits them to map workers, and then deserializes them. By storing the
  # training shards in files, the serialization work only needs to be done once.
  #
  # A similar effect could be achieved by simply holding pickled bytes in
  # memory; however, the processing is not I/O bound, so avoiding the file
  # reads is unnecessary.
  with tf.gfile.Open(shard_path, "rb") as f:
    shard = pickle.load(f)

  users = shard[rconst.TRAIN_KEY][movielens.USER_COLUMN]
  items = shard[rconst.TRAIN_KEY][movielens.ITEM_COLUMN]

  if not is_training:
    # For eval, there is one positive which was held out from the training set.
    test_positive_dict = dict(zip(
        shard[rconst.EVAL_KEY][movielens.USER_COLUMN],
        shard[rconst.EVAL_KEY][movielens.ITEM_COLUMN]))

  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  user_blocks = []
  item_blocks = []
  label_blocks = []
  for i in range(len(boundaries) - 1):
    assert len(set(users[boundaries[i]:boundaries[i+1]])) == 1
    current_user = users[boundaries[i]]

    positive_items = items[boundaries[i]:boundaries[i+1]]
    positive_set = set(positive_items)
    if positive_items.shape[0] != len(positive_set):
      raise ValueError("Duplicate entries detected.")

    if is_training:
      n_pos = len(positive_set)
      negatives = stat_utils.sample_with_exclusion(
          num_items, positive_set, n_pos * num_neg, replacement=True)

    else:
      if not match_mlperf:
        # The MLPerf reference allows the holdout item to appear as a negative.
        # Including it in the positive set makes the eval more stringent, because
        # an appearance of the test item would be removed by deduplication rules
        # (effectively resulting in a minute reduction of NUM_EVAL_NEGATIVES).
        positive_set.add(test_positive_dict[current_user])

      negatives = stat_utils.sample_with_exclusion(
          num_items, positive_set, num_neg, replacement=match_mlperf)
      positive_set = [test_positive_dict[current_user]]
      n_pos = len(positive_set)
      assert n_pos == 1

    user_blocks.append(current_user * np.ones(
        (n_pos * (1 + num_neg),), dtype=np.int32))
    item_blocks.append(
        np.array(list(positive_set) + negatives, dtype=np.uint16))
    labels_for_user = np.zeros((n_pos * (1 + num_neg),), dtype=np.int8)
    labels_for_user[:n_pos] = 1
    label_blocks.append(labels_for_user)

  users_out = np.concatenate(user_blocks)
  items_out = np.concatenate(item_blocks)
  labels_out = np.concatenate(label_blocks)

  assert users_out.shape == items_out.shape == labels_out.shape
  return users_out, items_out, labels_out
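
This variant expects each pickled shard to be a nested dict keyed by rconst.TRAIN_KEY and rconst.EVAL_KEY, each mapping column names to arrays. A hypothetical shard with placeholder key strings (the real constants live in rconst and movielens) might be produced like this:

import pickle

import numpy as np

example_shard = {
    "train": {  # stands in for rconst.TRAIN_KEY
        "user_id": np.array([0, 0, 1, 1, 1], dtype=np.int32),        # movielens.USER_COLUMN
        "item_id": np.array([10, 11, 20, 21, 22], dtype=np.uint16),  # movielens.ITEM_COLUMN
    },
    "eval": {   # stands in for rconst.EVAL_KEY: one held-out positive per user
        "user_id": np.array([0, 1], dtype=np.int32),
        "item_id": np.array([12, 23], dtype=np.uint16),
    },
}

with open("/tmp/example_shard.pickle", "wb") as f:
    pickle.dump(example_shard, f)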
Example #6
def _train_eval_map_fn(args):
  # type: (...) -> typing.Dict[str, np.ndarray]
  """Split training and testing data and generate testing negatives.

  This function is called as part of a multiprocessing map. The principal
  input is a shard, which contains a sorted array of users and corresponding
  items for each user, where items have already been sorted in ascending order
  by timestamp. (Timestamp is not passed to avoid the serialization cost of
  sending it to the map function.)

  For each user, all but the last item are written into a pickle file, which the
  training data producer can consume as needed. The last item for a user
  is a validation point; for each validation point a number of negatives are
  generated (typically 999). The validation data is returned by this function,
  as it is held in memory for the remainder of the run.

  Args:
    shard: A dict containing the user and item arrays.
    shard_id: The id of the shard provided. This is used to number the training
      shard pickle files.
    num_items: The cardinality of the item set, which determines the set from
      which validation negatives should be drawn.
    cache_paths: rconst.Paths object containing locations for various cache
      files.
    seed: Random seed to be used when generating testing negatives.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.

  Returns:
    A dict containing the evaluation data for a given shard.
  """

  shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
  np.random.seed(seed)

  users = shard[movielens.USER_COLUMN]
  items = shard[movielens.ITEM_COLUMN]

  # This produces index boundaries which can be used to slice by user.
  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  train_blocks = []
  test_blocks = []
  test_positives = []
  for i in range(len(boundaries) - 1):
    # This is simply a vector of repeated values such that the shard could be
    # represented compactly with a tuple of tuples:
    #   ((user_id, items), (user_id, items), ...)
    # rather than:
    #   user_id_vector, item_id_vector
    # However the additional nested structure significantly increases the
    # serialization and deserialization cost such that it is not worthwhile.
    block_user = users[boundaries[i]:boundaries[i+1]]
    assert len(set(block_user)) == 1

    block_items = items[boundaries[i]:boundaries[i+1]]
    train_blocks.append((block_user[:-1], block_items[:-1]))

    test_negatives = stat_utils.sample_with_exclusion(
        num_items=num_items, positive_set=set(block_items),
        n=rconst.NUM_EVAL_NEGATIVES, replacement=match_mlperf)
    test_blocks.append((
        block_user[0] * np.ones((rconst.NUM_EVAL_NEGATIVES + 1,),
                                dtype=np.int32),
        np.array([block_items[-1]] + test_negatives, dtype=np.uint16)
    ))
    test_positives.append((block_user[0], block_items[-1]))

  train_users = np.concatenate([i[0] for i in train_blocks])
  train_items = np.concatenate([i[1] for i in train_blocks])

  train_shard_fpath = cache_paths.train_shard_template.format(
      str(shard_id).zfill(5))

  with tf.gfile.Open(train_shard_fpath, "wb") as f:
    pickle.dump({
        movielens.USER_COLUMN: train_users,
        movielens.ITEM_COLUMN: train_items,
    }, f)

  test_users = np.concatenate([i[0] for i in test_blocks])
  test_items = np.concatenate([i[1] for i in test_blocks])
  assert test_users.shape == test_items.shape
  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0

  return {
      movielens.USER_COLUMN: test_users,
      movielens.ITEM_COLUMN: test_items,
  }
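
The dict returned above concatenates eval rows so that each consecutive block of NUM_EVAL_NEGATIVES + 1 rows belongs to a single user, with that user's held-out positive in position 0 of the block. A small, assumed post-processing helper (NUM_EVAL_NEGATIVES = 999 is taken from the docstring, not imported from rconst) could reshape the arrays per user:

import numpy as np

NUM_EVAL_NEGATIVES = 999  # assumed to match rconst.NUM_EVAL_NEGATIVES

def reshape_eval_data(eval_data, user_key, item_key):
    # Returns (num_users, NUM_EVAL_NEGATIVES + 1) views of the users and items,
    # one row per user, with the true positive in column 0 of each row.
    users = eval_data[user_key]
    items = eval_data[item_key]
    block = NUM_EVAL_NEGATIVES + 1
    assert users.shape[0] % block == 0
    return users.reshape(-1, block), items.reshape(-1, block)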