def split_train_test_userwise_random(
    df_: pd.DataFrame,
    user_colname: str,
    item_colname: str,
    item_ids: List[Any],
    heldout_ratio: float,
    n_heldout: Optional[int],
    rns: np.random.RandomState,
    rating_column: Optional[str] = None,
) -> UserTrainTestInteractionPair:
    """Split the user x item data frame into a pair of sparse matrices
    (represented as a UserTrainTestInteractionPair).

    Parameters
    ----------
    df_:
        user x item interaction matrix.
    user_colname:
        The column name for the users.
    item_colname:
        The column name for the items.
    item_ids:
        The universe of item ids. Interactions on items outside this list
        are dropped, and the item axis of the resulting matrices follows
        this ordering.
    heldout_ratio:
        The percentage of items (per-user) to be held out as a
        test(validation) ones.
    n_heldout:
        The maximal number of items (per-user) to be held out as a
        test(validation) ones.
    rns:
        The random state.
    rating_column:
        The column for the rating values. If None, the rating values will
        be all equal (1), by default None.

    Returns
    -------
    UserDataSet
        Resulting train-test split dataset.
    """
    # Keep only interactions on known items so that the Categorical codes
    # below never produce -1 (unknown-category) indices.
    df_ = df_[df_[item_colname].isin(item_ids)]
    # Categorical codes give 0-based item indices consistent with the
    # `item_ids` ordering; np.unique gives sorted user ids + inverse indices.
    item_indices = pd.Categorical(df_[item_colname], categories=item_ids).codes
    user_ids, user_indices = np.unique(df_[user_colname], return_inverse=True)
    if rating_column is not None:
        data = df_[rating_column].values
    else:
        # Implicit-feedback case: every observed interaction counts as 1.
        data = np.ones(df_.shape[0], dtype=np.int32)
    X_all = sps.csr_matrix(
        (data, (user_indices, item_indices)),
        shape=(len(user_ids), len(item_ids)),
    )
    # Draw a concrete seed from the provided RandomState so the rowwise
    # splitter (which takes an int seed) is reproducible.
    X_learn, X_predict = rowwise_train_test_split(
        X_all,
        heldout_ratio,
        n_heldout,
        random_seed=rns.randint(-(2**31), 2**31 - 1),
    )
    return UserTrainTestInteractionPair(
        user_ids, X_learn.tocsr(), X_predict.tocsr(), item_ids
    )
def test_split() -> None:
    """rowwise_train_test_split must partition X into disjoint halves."""
    # Enable all warnings only for the call under test; catch_warnings
    # restores the global filter afterwards instead of leaking
    # simplefilter("always") into every subsequent test.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        X_1, X_2 = rowwise_train_test_split(X, test_ratio=0.5, random_seed=1)
    # Train + test must reconstruct the original matrix exactly...
    assert np.all((X - X_1 - X_2).toarray() == 0)
    # should have no overwrap
    assert np.all(X_1.multiply(X_2).toarray() == 0)
def split_train_test_userwise_random(
    df_: pd.DataFrame,
    user_colname: str,
    item_colname: str,
    item_ids: Union[List[Any], np.ndarray],
    heldout_ratio: float,
    n_heldout: Optional[int],
    rns: np.random.RandomState,
    rating_column: Optional[str] = None,
    ceil_n_heldout: bool = False,
) -> UserTrainTestInteractionPair:
    r"""Split the user x item data frame into a pair of sparse matrices
    (represented as a UserTrainTestInteractionPair).

    Args
    ----------
    df_:
        user x item interaction matrix.
    user_colname:
        The column name for the users.
    item_colname:
        The column name for the items.
    item_ids:
        The universe of item ids. Interactions on items outside this
        collection are dropped, and the item axis of the resulting
        matrices follows this ordering.
    heldout_ratio:
        The percentage of items (per-user) to be held out as a
        test(validation) ones.
    n_heldout:
        The maximal number of items (per-user) to be held out as a
        test(validation) ones.
    rns:
        The random state.
    rating_column:
        The column for the rating values. If None, the rating values will
        be all equal (1), by default None.
    ceil_n_heldout:
        If this is `True` and `n_heldout` is `None`, the number of test
        interaction for a given user `u` will be
        `ceil(N_u * heldout_ratio)` where `N_u` is the number of
        interactions of `u`. If this is `False`,
        `floor(N_u * heldout_ratio)` will be used instead.
        Defaults to `False`.

    Returns
    -------
    UserDataSet
        Resulting train-test split dataset.
    """
    # Delegate matrix construction (filtering, id -> index mapping,
    # rating handling) to the shared df_to_sparse helper.
    X_all, user_ids, _ = df_to_sparse(
        df_,
        user_colname=user_colname,
        item_colname=item_colname,
        item_ids=item_ids,
        rating_colname=rating_column,
    )
    X_learn, X_predict = rowwise_train_test_split(
        X_all,
        heldout_ratio,
        n_heldout,
        random_state=rns,
        ceil_n_heldout=ceil_n_heldout,
    )
    return UserTrainTestInteractionPair(
        user_ids, X_learn.tocsr(), X_predict.tocsr(), item_ids
    )
def test_split_fixed_n() -> None:
    """With n_test=1, at most one interaction per row lands in the test part."""
    train, test = rowwise_train_test_split(
        X, test_ratio=0.5, n_test=1, random_seed=1
    )
    # The two halves must reassemble the original matrix exactly.
    np.testing.assert_allclose(X.toarray(), (train + test).toarray())
    # should have no overwrap
    overlap = train.multiply(test)
    assert np.all(overlap.toarray() == 0)
    # Binarize the test half, then verify each row holds out at most one item.
    test.data[:] = 1
    assert test.sum(axis=1).max() <= 1
def test_split() -> None:
    """The split partitions X and honors the per-row heldout ratio bound."""
    train, test = rowwise_train_test_split(X, test_ratio=0.5, random_seed=1)
    # Train + test must reconstruct the original matrix exactly.
    np.testing.assert_allclose(X.toarray(), (train + test).toarray())
    # should have no overwrap
    assert np.all(train.multiply(test).toarray() == 0)
    # Per-row nonzero counts of X via CSR indptr differences.
    row_nnz = np.diff(X.indptr)
    # Count test entries per row on a binarized copy so values don't matter.
    binarized = test.copy()
    binarized.data[:] = 1.0
    test_row_nnz = binarized.sum(axis=1).A1
    # No row may exceed its ratio-implied budget of held-out items.
    assert np.all((row_nnz * 0.5) >= test_row_nnz)