def _process_shard(args):
  # type: ((str, int, int, int)) -> (np.ndarray, np.ndarray, np.ndarray)
  """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
  """
  shard_path, num_items, num_neg, seed = args
  np.random.seed(seed)

  # The choice to store the training shards in files rather than in memory
  # is motivated by the fact that multiprocessing serializes arguments,
  # transmits them to map workers, and then deserializes them. By storing the
  # training shards in files, the serialization work only needs to be done
  # once.
  #
  # A similar effect could be achieved by simply holding pickled bytes in
  # memory; however, since the processing is not I/O bound, that approach is
  # unnecessary.
  with tf.gfile.Open(shard_path, "rb") as f:
    shard = pickle.load(f)

  users = shard[movielens.USER_COLUMN]
  items = shard[movielens.ITEM_COLUMN]

  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  user_blocks = []
  item_blocks = []
  label_blocks = []
  for i in range(len(boundaries) - 1):
    assert len(set(users[boundaries[i]:boundaries[i+1]])) == 1
    positive_items = items[boundaries[i]:boundaries[i+1]]
    positive_set = set(positive_items)
    if positive_items.shape[0] != len(positive_set):
      raise ValueError("Duplicate entries detected.")
    n_pos = len(positive_set)

    negatives = stat_utils.sample_with_exclusion(
        num_items, positive_set, n_pos * num_neg)

    user_blocks.append(users[boundaries[i]] * np.ones(
        (n_pos * (1 + num_neg),), dtype=np.int32))
    item_blocks.append(
        np.array(list(positive_set) + negatives, dtype=np.uint16))
    labels_for_user = np.zeros((n_pos * (1 + num_neg),), dtype=np.int8)
    labels_for_user[:n_pos] = 1
    label_blocks.append(labels_for_user)

  users_out = np.concatenate(user_blocks)
  items_out = np.concatenate(item_blocks)
  labels_out = np.concatenate(label_blocks)

  assert users_out.shape == items_out.shape == labels_out.shape
  return users_out, items_out, labels_out
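# `stat_utils.sample_with_exclusion` is defined elsewhere and its
# implementation is not shown here. Below is a minimal sketch of the behavior
# `_process_shard` relies on, assuming negatives are drawn uniformly from
# range(num_items) with rejection of anything in the positive set. This is a
# hypothetical illustration, not the actual stat_utils code.
def _sample_with_exclusion_sketch(num_items, positive_set, n):
  """Draw n negatives uniformly from range(num_items), excluding positives.

  Assumes num_items is much larger than len(positive_set), so rejection
  sampling terminates quickly.
  """
  negatives = []
  excluded = set(positive_set)
  while len(negatives) < n:
    # Oversample to cut down on the number of rejection rounds.
    candidates = np.random.randint(0, num_items,
                                   size=2 * (n - len(negatives)))
    for candidate in candidates:
      if len(negatives) == n:
        break
      if candidate in excluded:
        continue  # Reject positives and prior draws.
      negatives.append(int(candidate))
      excluded.add(int(candidate))  # Sample without replacement.
  return negatives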
def _train_eval_map_fn(args):
  # type: (...) -> typing.Dict[str, np.ndarray]
  """Split training and testing data and generate testing negatives.

  This function is called as part of a multiprocessing map. The principal
  input is a shard, which contains a sorted array of users and corresponding
  items for each user, where items have already been sorted in ascending
  order by timestamp. (Timestamp is not passed to avoid the serialization
  cost of sending it to the map function.)

  For each user, all but the last item is written into a pickle file which
  the training data producer can consume as needed. The last item for a user
  is a validation point; for each validation point a number of negatives are
  generated (typically 999). The validation data is returned by this
  function, as it is held in memory for the remainder of the run.

  Args:
    shard: A dict containing the user and item arrays.
    shard_id: The id of the shard provided. This is used to number the
      training shard pickle files.
    num_items: The cardinality of the item set, which determines the set from
      which validation negatives should be drawn.
    cache_paths: rconst.Paths object containing locations for various cache
      files.
    seed: Random seed to be used when generating testing negatives.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.

  Returns:
    A dict containing the evaluation data for a given shard.
  """
  shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
  np.random.seed(seed)

  users = shard[movielens.USER_COLUMN]
  items = shard[movielens.ITEM_COLUMN]

  # This produces index boundaries which can be used to slice by user.
  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  train_blocks = []
  test_blocks = []
  test_positives = []
  for i in range(len(boundaries) - 1):
    # This is simply a vector of repeated values such that the shard could be
    # represented compactly with a tuple of tuples:
    #   ((user_id, items), (user_id, items), ...)
    # rather than:
    #   user_id_vector, item_id_vector
    # However the additional nested structure significantly increases the
    # serialization and deserialization cost such that it is not worthwhile.
    block_user = users[boundaries[i]:boundaries[i+1]]
    assert len(set(block_user)) == 1

    block_items = items[boundaries[i]:boundaries[i+1]]
    train_blocks.append((block_user[:-1], block_items[:-1]))

    test_negatives = stat_utils.sample_with_exclusion(
        num_items=num_items, positive_set=set(block_items),
        n=rconst.NUM_EVAL_NEGATIVES, replacement=match_mlperf)
    test_blocks.append((
        block_user[0] * np.ones((rconst.NUM_EVAL_NEGATIVES + 1,),
                                dtype=np.int32),
        np.array([block_items[-1]] + test_negatives, dtype=np.uint16)
    ))
    test_positives.append((block_user[0], block_items[-1]))

  train_users = np.concatenate([i[0] for i in train_blocks])
  train_items = np.concatenate([i[1] for i in train_blocks])

  train_shard_fpath = cache_paths.train_shard_template.format(
      str(shard_id).zfill(5))

  with tf.gfile.Open(train_shard_fpath, "wb") as f:
    pickle.dump({
        movielens.USER_COLUMN: train_users,
        movielens.ITEM_COLUMN: train_items,
    }, f)

  test_users = np.concatenate([i[0] for i in test_blocks])
  test_items = np.concatenate([i[1] for i in test_blocks])

  assert test_users.shape == test_items.shape
  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0

  return {
      movielens.USER_COLUMN: test_users,
      movielens.ITEM_COLUMN: test_items,
  }
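# The delta/boundaries idiom used in the functions above slices the sorted
# user vector into contiguous per-user blocks. A small worked demo with
# illustrative values (not part of the pipeline):
def _boundaries_example():
  """Hypothetical demo of slicing a sorted user vector by user."""
  users = np.array([0, 0, 0, 1, 1, 4, 4, 4, 4], dtype=np.int32)

  # Nonzero deltas mark the positions where the user id changes.
  delta = users[1:] - users[:-1]  # [0, 0, 1, 0, 3, 0, 0, 0]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])  # [0, 3, 5, 9]

  # users[0:3] -> user 0, users[3:5] -> user 1, users[5:9] -> user 4.
  for i in range(len(boundaries) - 1):
    block = users[boundaries[i]:boundaries[i+1]]
    assert len(set(block)) == 1  # Each slice belongs to exactly one user.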
def _process_shard(args):
  # type: ((str, int, int, int, bool)) -> (np.ndarray, np.ndarray, np.ndarray)
  """Read a shard of training data and return training vectors.

  Args:
    shard_path: The filepath of the positive instance training shard.
    num_items: The cardinality of the item set.
    num_neg: The number of negatives to generate per positive example.
    seed: Random seed to be used when generating negatives.
    is_training: Generate training (True) or eval (False) data.
    match_mlperf: Match the MLPerf reference behavior.
  """
  shard_path, num_items, num_neg, seed, is_training, match_mlperf = args
  np.random.seed(seed)

  # The choice to store the training shards in files rather than in memory
  # is motivated by the fact that multiprocessing serializes arguments,
  # transmits them to map workers, and then deserializes them. By storing the
  # training shards in files, the serialization work only needs to be done
  # once.
  #
  # A similar effect could be achieved by simply holding pickled bytes in
  # memory; however, since the processing is not I/O bound, that approach is
  # unnecessary.
  with tf.gfile.Open(shard_path, "rb") as f:
    shard = pickle.load(f)

  users = shard[rconst.TRAIN_KEY][movielens.USER_COLUMN]
  items = shard[rconst.TRAIN_KEY][movielens.ITEM_COLUMN]

  if not is_training:
    # For eval, there is one positive which was held out from the training
    # set.
    test_positive_dict = dict(zip(
        shard[rconst.EVAL_KEY][movielens.USER_COLUMN],
        shard[rconst.EVAL_KEY][movielens.ITEM_COLUMN]))

  delta = users[1:] - users[:-1]
  boundaries = ([0] + (np.argwhere(delta)[:, 0] + 1).tolist() +
                [users.shape[0]])

  user_blocks = []
  item_blocks = []
  label_blocks = []
  for i in range(len(boundaries) - 1):
    assert len(set(users[boundaries[i]:boundaries[i+1]])) == 1
    current_user = users[boundaries[i]]

    positive_items = items[boundaries[i]:boundaries[i+1]]
    positive_set = set(positive_items)
    if positive_items.shape[0] != len(positive_set):
      raise ValueError("Duplicate entries detected.")

    if is_training:
      n_pos = len(positive_set)
      negatives = stat_utils.sample_with_exclusion(
          num_items, positive_set, n_pos * num_neg, replacement=True)
    else:
      if not match_mlperf:
        # The MLPerf reference allows the holdout item to appear as a
        # negative. Including it in the positive set makes the eval more
        # stringent, because an appearance of the test item would be removed
        # by deduplication rules. (This effectively results in a minute
        # reduction of NUM_EVAL_NEGATIVES.)
        positive_set.add(test_positive_dict[current_user])

      negatives = stat_utils.sample_with_exclusion(
          num_items, positive_set, num_neg, replacement=match_mlperf)
      positive_set = [test_positive_dict[current_user]]
      n_pos = len(positive_set)
      assert n_pos == 1

    user_blocks.append(current_user * np.ones(
        (n_pos * (1 + num_neg),), dtype=np.int32))
    item_blocks.append(
        np.array(list(positive_set) + negatives, dtype=np.uint16))
    labels_for_user = np.zeros((n_pos * (1 + num_neg),), dtype=np.int8)
    labels_for_user[:n_pos] = 1
    label_blocks.append(labels_for_user)

  users_out = np.concatenate(user_blocks)
  items_out = np.concatenate(item_blocks)
  labels_out = np.concatenate(label_blocks)

  assert users_out.shape == items_out.shape == labels_out.shape
  return users_out, items_out, labels_out
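# The map functions above are written to be dispatched via a multiprocessing
# map, one argument tuple per shard. A hypothetical driver sketch for the
# training path; the pool size, seed handling, and surrounding pipeline are
# illustrative assumptions, not the actual callers.
import contextlib
import multiprocessing


def _generate_training_data_sketch(shard_paths, num_items, num_neg,
                                   base_seed):
  """Fan _process_shard out over a worker pool and stitch the results."""
  # One argument tuple per shard; distinct seeds keep the generated
  # negatives independent across workers.
  map_args = [(path, num_items, num_neg, base_seed + i, True, False)
              for i, path in enumerate(shard_paths)]
  with contextlib.closing(multiprocessing.Pool(processes=4)) as pool:
    results = pool.map(_process_shard, map_args)
  users = np.concatenate([r[0] for r in results])
  items = np.concatenate([r[1] for r in results])
  labels = np.concatenate([r[2] for r in results])
  return users, items, labels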