def test_Interactions_with_0_ratings(interactions_pandas, df_for_interactions_with_0_ratings):
    """Data containing 0-ratings should trigger a ``UserWarning`` but still build a matrix
    equal to the baseline ``interactions_pandas`` fixture."""
    df = df_for_interactions_with_0_ratings

    with pytest.warns(UserWarning):
        zero_rating_interactions = Interactions(
            users=df['user_id'],
            items=df['item_id'],
            ratings=df['ratings'],
            check_num_negative_samples_is_valid=False,
        )

    assert np.array_equal(interactions_pandas.toarray(), zero_rating_interactions.toarray())
def test_items_length_bad(self, df_for_interactions_with_missing_ids):
    """An ``items`` sequence shorter than ``users`` must raise ``ValueError``."""
    df = df_for_interactions_with_missing_ids

    with pytest.raises(ValueError):
        Interactions(
            users=df['user_id'],
            items=df['item_id'][:-1],  # deliberately one element short
            ratings=df['ratings'],
            allow_missing_ids=True,
            check_num_negative_samples_is_valid=False,
        )
def movielens_implicit_interactions(movielens_implicit_df):
    """Fixture: ``Interactions`` built from the implicit MovieLens DataFrame with
    10 negative samples per interaction."""
    df = movielens_implicit_df

    return Interactions(
        users=df['user_id'],
        items=df['item_id'],
        ratings=df['rating'],
        num_negative_samples=10,
        max_number_of_samples_to_consider=200,
        allow_missing_ids=True,
    )
def test_implicit_interactions():
    """Return a small, hard-coded implicit ``Interactions`` object (all ratings are 1)."""
    user_ids = [0, 0, 0, 1, 1, 1, 2, 2]
    item_ids = [0, 1, 2, 1, 2, 3, 0, 2]
    implicit_ratings = [1] * len(user_ids)

    return Interactions(
        users=user_ids,
        items=item_ids,
        ratings=implicit_ratings,
        check_num_negative_samples_is_valid=False,
    )
def df_to_interactions(df: pd.DataFrame,
                       user_col: str = 'user_id',
                       item_col: str = 'item_id',
                       ratings_col: Optional[str] = 'rating',
                       **kwargs) -> Interactions:
    """
    Helper function to convert a DataFrame to an ``Interactions`` object.

    Parameters
    -------------
    df: pd.DataFrame
        Dataframe with columns for user IDs, item IDs, and (optionally) ratings
    user_col: str
        Column name for the user IDs
    item_col: str
        Column name for the item IDs
    ratings_col: str
        Column name for the ratings column. If ``None``, will default to ratings of all 1s
    **kwargs
        Keyword arguments to pass to ``Interactions``

    Returns
    -------------
    interactions: collie_recs.interactions.Interactions

    """
    if ratings_col is None:
        ratings = None
    else:
        ratings = df[ratings_col]

    return Interactions(users=df[user_col], items=df[item_col], ratings=ratings, **kwargs)
def test_duplicate_user_item_pairs(self, interactions_pandas, df_for_interactions_with_duplicates):
    """Duplicate user/item pairs should collapse so the number of stored non-zeros
    matches the de-duplicated baseline."""
    df = df_for_interactions_with_duplicates

    duplicated_interactions = Interactions(
        users=df['user_id'],
        items=df['item_id'],
        check_num_negative_samples_is_valid=False,
    )

    assert duplicated_interactions.mat.getnnz() == interactions_pandas.mat.getnnz()
def test_Interactions_exact_negative_samples_num_negative_samples_too_large(
    self,
    ratings_matrix_for_interactions,
):
    """Requesting more exact negative samples than can be drawn must fail the
    validity assertion."""
    with pytest.raises(AssertionError):
        Interactions(
            mat=ratings_matrix_for_interactions,
            max_number_of_samples_to_consider=200,
            num_negative_samples=8,  # too many for this matrix
        )
def test_Interactions_with_missing_ids_raises_error(
    self,
    df_for_interactions_with_missing_ids,
    ratings_matrix_for_interactions_with_missing_ids,
    sparse_ratings_matrix_for_interactions_with_missing_ids
):
    """Missing IDs should raise ``ValueError`` for every construction path when
    ``allow_missing_ids`` is left at its default."""
    df = df_for_interactions_with_missing_ids

    # dense-matrix construction
    with pytest.raises(ValueError):
        Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                     check_num_negative_samples_is_valid=False)

    # sparse-matrix construction
    with pytest.raises(ValueError):
        Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                     check_num_negative_samples_is_valid=False)

    # explicit users/items/ratings construction
    with pytest.raises(ValueError):
        Interactions(users=df['user_id'],
                     items=df['item_id'],
                     ratings=df['ratings'],
                     check_num_negative_samples_is_valid=False)
def test_Interactions_with_missing_ids(
    self,
    df_for_interactions_with_missing_ids,
    ratings_matrix_for_interactions_with_missing_ids,
    sparse_ratings_matrix_for_interactions_with_missing_ids
):
    """With ``allow_missing_ids=True``, every construction path should succeed."""
    df = df_for_interactions_with_missing_ids

    Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)

    Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)

    Interactions(users=df['user_id'],
                 items=df['item_id'],
                 ratings=df['ratings'],
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)
def test_Interactions_approximate_negative_samples(self, ratings_matrix_for_interactions):
    """With ``max_number_of_samples_to_consider=0`` (pure approximate sampling),
    no positive-item lookup is built, yet indexing still yields the requested
    number of negative samples per positive interaction."""
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=0,
                                seed=42)

    # approximate mode never materializes the positive-item lookup
    assert interactions.positive_items == {}

    for _ in range(3):
        _, negative_samples = interactions[0]

        # was a hard-coded ``3``; use the same constant the object was built with,
        # consistent with the sibling approximate/exact sampling tests
        assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
def test_Interactions(ratings_matrix_for_interactions,
                      sparse_ratings_matrix_for_interactions,
                      df_for_interactions):
    """All three construction paths (dense, sparse, explicit columns) should yield
    equal matrices and equal dimension/interaction counts."""
    from_dense = Interactions(mat=ratings_matrix_for_interactions,
                              check_num_negative_samples_is_valid=False)
    from_sparse = Interactions(mat=sparse_ratings_matrix_for_interactions,
                               check_num_negative_samples_is_valid=False)
    from_columns = Interactions(users=df_for_interactions['user_id'],
                                items=df_for_interactions['item_id'],
                                ratings=df_for_interactions['rating'],
                                check_num_negative_samples_is_valid=False)

    np.testing.assert_equal(from_dense.toarray(), from_sparse.toarray())
    np.testing.assert_equal(from_dense.toarray(), from_columns.toarray())

    assert from_dense.num_users == from_sparse.num_users == from_columns.num_users
    assert from_dense.num_items == from_sparse.num_items == from_columns.num_items
    assert (from_dense.num_interactions
            == from_sparse.num_interactions
            == from_columns.num_interactions)
def test_Interactions_approximate_negative_samples_partway_through(
        ratings_matrix_for_interactions):
    """With a tiny exact-sampling budget (``max_number_of_samples_to_consider=1``),
    construction warns and falls back partway through, but the positive-item
    lookup is partially built and sampling still yields the requested count."""
    with pytest.warns(UserWarning):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=1,
                                    seed=42)

    # unlike pure approximate mode, some positive items were recorded before fallback
    assert interactions.positive_items != {}

    for _ in range(3):
        _, negative_samples = interactions[0]

        # was a hard-coded ``3``; use the same constant the object was built with,
        # consistent with the sibling approximate/exact sampling tests
        assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
def interactions_to_split_with_a_user_with_only_one_interaction():
    """Fixture: ``Interactions`` where user ``0`` has exactly one interaction,
    for exercising splits that must handle single-interaction users."""
    interactions_df = pd.DataFrame(
        data={
            'user_id': [0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4],
            'item_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 2, 4, 5],
            'rating': [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 2, 3, 5, 4],
        }
    )

    return Interactions(users=interactions_df['user_id'],
                        items=interactions_df['item_id'],
                        ratings=interactions_df['rating'],
                        check_num_negative_samples_is_valid=False)
def test_Interactions_approximate_negative_samples_many_users(
        ratings_matrix_for_interactions):
    """Approximate sampling with a batch of users should return one group of
    ``NUM_NEGATIVE_SAMPLES`` negatives per user."""
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=0,
                                seed=42)

    # approximate mode never materializes the positive-item lookup
    assert interactions.positive_items == {}

    batch_of_users = list(range(NUM_USERS_TO_GENERATE))
    for _ in range(3):
        _, negative_samples = interactions[batch_of_users]

        assert len(negative_samples) == NUM_USERS_TO_GENERATE
        for samples_for_one_user in negative_samples:
            assert len(samples_for_one_user) == NUM_NEGATIVE_SAMPLES
def _subset_interactions(interactions: Interactions, idxs: Iterable[int]) -> Interactions:
    """Return a new ``Interactions`` restricted to the interactions at ``idxs``,
    preserving the original shape and sampling configuration."""
    idxs = np.array(idxs)

    # slice the underlying COO data/row/col arrays while keeping the full shape
    subset_mat = coo_matrix(
        (
            interactions.mat.data[idxs],
            (interactions.mat.row[idxs], interactions.mat.col[idxs]),
        ),
        shape=(interactions.num_users, interactions.num_items),
    )

    # carry over every sampling-related setting from the source object;
    # the subset may legitimately be missing IDs, so allow that and skip the
    # negative-sample validity check
    return Interactions(
        mat=subset_mat,
        num_negative_samples=interactions.num_negative_samples,
        allow_missing_ids=True,
        num_users=interactions.num_users,
        num_items=interactions.num_items,
        check_num_negative_samples_is_valid=False,
        max_number_of_samples_to_consider=interactions.max_number_of_samples_to_consider,
        seed=interactions.seed,
    )
def test_Interactions_exact_negative_samples(self, ratings_matrix_for_interactions):
    """Exact negative sampling should never return an item the user has
    interacted with, and repeated draws should vary."""
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=200,
                                seed=42)

    # exact mode builds the positive-item lookup
    assert interactions.positive_items != {}

    positive_item_ids = ratings_matrix_for_interactions[0].nonzero()[0]

    seen_negative_samples = []
    for _ in range(10):
        _, negative_samples = interactions[0]

        assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
        for sample in negative_samples:
            assert sample.item() not in positive_item_ids

        seen_negative_samples += negative_samples.tolist()

    # across 10 draws we should see more distinct items than a single draw holds
    assert len(set(seen_negative_samples)) > NUM_NEGATIVE_SAMPLES
def test_Interactions_exact_negative_samples_many_users(self, ratings_matrix_for_interactions):
    """Batched exact negative sampling should exclude each user's own positive
    items from that user's samples."""
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=200,
                                seed=42)

    # exact mode builds the positive-item lookup
    assert interactions.positive_items != {}

    batch_of_users = list(range(NUM_USERS_TO_GENERATE))
    for _ in range(10):
        (user_ids, _), negative_samples = interactions[batch_of_users]

        assert len(negative_samples) == NUM_USERS_TO_GENERATE

        for position, user_id in enumerate(user_ids):
            samples_for_user = negative_samples[position]
            assert len(samples_for_user) == NUM_NEGATIVE_SAMPLES

            positive_item_ids = ratings_matrix_for_interactions[user_id].nonzero()[0]
            for sample in samples_for_user:
                assert sample.item() not in positive_item_ids
def test_stratified_split(interactions_to_split):
    """``stratified_split`` with ``seed=46`` should reproduce these exact
    train/validate/test partitions."""
    shape = (interactions_to_split.num_users, interactions_to_split.num_items)

    def build_expected(user_ids, item_ids, ratings):
        # Assemble an Interactions object from hard-coded expected triples.
        return Interactions(
            mat=coo_matrix((ratings, (user_ids, item_ids)), shape=shape),
            allow_missing_ids=True,
            check_num_negative_samples_is_valid=False,
        )

    train_expected = build_expected(
        user_ids=[0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
        item_ids=[1, 2, 3, 4, 6, 8, 1, 2, 3, 4, 2, 4, 5],
        ratings=[2, 3, 4, 5, 3, 1, 1, 2, 4, 5, 5, 5, 4],
    )
    validate_expected = build_expected(
        user_ids=[0, 1, 2, 3, 4],
        item_ids=[7, 3, 2, 1, 2],
        ratings=[2, 3, 3, 1, 3],
    )
    test_expected = build_expected(
        user_ids=[0, 0, 1, 2, 3, 4],
        item_ids=[0, 5, 4, 1, 4, 1],
        ratings=[1, 4, 4, 2, 4, 2],
    )

    train_actual, validate_actual, test_actual = stratified_split(
        interactions=interactions_to_split, val_p=0.1, test_p=0.2, seed=46
    )

    np.testing.assert_array_equal(train_actual.toarray(), train_expected.toarray())
    np.testing.assert_array_equal(validate_actual.toarray(), validate_expected.toarray())
    np.testing.assert_array_equal(test_actual.toarray(), test_expected.toarray())

    all_splits = [train_actual, train_expected,
                  validate_actual, validate_expected,
                  test_actual, test_expected]
    # every split, actual and expected, shares the same dimensions
    assert len({split.num_users for split in all_splits}) == 1
    assert len({split.num_items for split in all_splits}) == 1
def interactions_sparse_matrix(sparse_ratings_matrix_for_interactions):
    """Fixture: ``Interactions`` built directly from the sparse ratings matrix."""
    return Interactions(
        mat=sparse_ratings_matrix_for_interactions,
        check_num_negative_samples_is_valid=False,
    )
def interactions_pandas(df_for_interactions):
    """Fixture: ``Interactions`` built from the pandas interactions DataFrame."""
    df = df_for_interactions

    return Interactions(
        users=df['user_id'],
        items=df['item_id'],
        ratings=df['ratings'],
        check_num_negative_samples_is_valid=False,
    )
def test_ratings_None_but_its_okay(self, df_for_interactions):
    """Passing ``ratings=None`` is valid and must not raise."""
    df = df_for_interactions

    Interactions(
        users=df['user_id'],
        items=df['item_id'],
        ratings=None,  # defaults to implicit all-1s ratings
        check_num_negative_samples_is_valid=False,
    )
def test_users_None(self, df_for_interactions):
    """Passing ``users=None`` must fail the constructor's assertion."""
    df = df_for_interactions

    with pytest.raises(AssertionError):
        Interactions(
            users=None,
            items=df['item_id'],
            ratings=df['ratings'],
            check_num_negative_samples_is_valid=False,
        )
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    -------------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    t = Timer()

    # Step 1: fetch the MovieLens 100K data (IDs decremented to be 0-indexed).
    t.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck(' 1.0 complete')

    # Step 2: convert explicit ratings to implicit feedback, then split into
    # train / validation / test and wrap the first two in data loaders.
    t.timecheck(' 2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck(' 2.0 complete')

    # Step 3: train a matrix-factorization model with early stopping on the
    # epoch-level validation loss, then switch to eval mode.
    t.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
        weights_summary='full',
        terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n 3.0 complete')

    # Step 4: score the held-out test set with AUC, MRR, and MAP@10.
    t.timecheck(' 4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk],
                                                           test,
                                                           model,
                                                           k=10)
    print(f'AUC: {auc_score}')
    print(f'MRR: {mrr_score}')
    print(f'MAP@10: {mapk_score}')
    t.timecheck(' 4.0 complete')

    # Step 5: persist the fitted model to disk.
    t.timecheck(' 5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck(' 5.0 complete')