def test_Interactions_with_0_ratings(interactions_pandas, df_for_interactions_with_0_ratings):
    with pytest.warns(UserWarning):
        interactions_with_0s = Interactions(
            users=df_for_interactions_with_0_ratings['user_id'],
            items=df_for_interactions_with_0_ratings['item_id'],
            ratings=df_for_interactions_with_0_ratings['ratings'],
            check_num_negative_samples_is_valid=False,
        )

    assert np.array_equal(interactions_pandas.toarray(), interactions_with_0s.toarray())

def test_all_lengths_bad(self, df_for_interactions_with_missing_ids):
    Interactions(users=df_for_interactions_with_missing_ids['user_id'][:-1],
                 items=df_for_interactions_with_missing_ids['item_id'][:-1],
                 ratings=df_for_interactions_with_missing_ids['ratings'][:-1],
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)

def test_implicit_interactions():
    return Interactions(
        users=[0, 0, 0, 1, 1, 1, 2, 2],
        items=[0, 1, 2, 1, 2, 3, 0, 2],
        ratings=[1, 1, 1, 1, 1, 1, 1, 1],
        check_num_negative_samples_is_valid=False,
    )

def movielens_implicit_interactions(movielens_implicit_df):
    return Interactions(users=movielens_implicit_df['user_id'],
                        items=movielens_implicit_df['item_id'],
                        ratings=movielens_implicit_df['rating'],
                        num_negative_samples=10,
                        max_number_of_samples_to_consider=200,
                        allow_missing_ids=True)

def _subset_interactions(interactions: BaseInteractions,
                         idxs: Iterable[int]) -> Union[ExplicitInteractions, Interactions]:
    idxs = np.array(idxs)

    coo_mat = coo_matrix(
        (interactions.mat.data[idxs], (interactions.mat.row[idxs], interactions.mat.col[idxs])),
        shape=(interactions.num_users, interactions.num_items),
    )

    # disable all ``Interactions`` checks for the data splits, since we assume the initial
    # ``Interactions`` object would have these checks already applied prior to the data split
    if isinstance(interactions, Interactions):
        return Interactions(
            mat=coo_mat,
            num_negative_samples=interactions.num_negative_samples,
            allow_missing_ids=True,
            remove_duplicate_user_item_pairs=False,
            num_users=interactions.num_users,
            num_items=interactions.num_items,
            check_num_negative_samples_is_valid=False,
            max_number_of_samples_to_consider=interactions.max_number_of_samples_to_consider,
            seed=interactions.seed,
        )
    else:
        return ExplicitInteractions(
            mat=coo_mat,
            allow_missing_ids=True,
            remove_duplicate_user_item_pairs=False,
            num_users=interactions.num_users,
            num_items=interactions.num_items,
        )

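# A minimal usage sketch, not from the source (the helper name and index choice below are
# illustrative assumptions, and it relies on the same ``numpy``/``collie`` imports as the
# surrounding code): ``_subset_interactions`` builds a new interactions object from selected
# COO triplets while keeping the original ``num_users`` x ``num_items`` shape intact.
def _example_keep_first_n(interactions: Interactions, n: int) -> Interactions:
    # keep only the first ``n`` (user, item, rating) triplets of the underlying COO matrix
    return _subset_interactions(interactions=interactions, idxs=np.arange(n))
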
def df_to_interactions(df: pd.DataFrame,
                       user_col: str = 'user_id',
                       item_col: str = 'item_id',
                       ratings_col: Optional[str] = 'rating',
                       **kwargs) -> Interactions:
    """
    Helper function to convert a DataFrame to an ``Interactions`` object.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe with columns for user IDs, item IDs, and (optionally) ratings
    user_col: str
        Column name for the user IDs
    item_col: str
        Column name for the item IDs
    ratings_col: str
        Column name for the ratings column. If ``None``, will default to ratings of all 1s
    **kwargs
        Keyword arguments to pass to ``Interactions``

    Returns
    -------
    interactions: collie.interactions.Interactions

    """
    ratings = df[ratings_col] if ratings_col is not None else None

    return Interactions(users=df[user_col], items=df[item_col], ratings=ratings, **kwargs)

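# Hedged usage sketch for ``df_to_interactions`` (the toy DataFrame below is made up for
# illustration and is not part of the library; it assumes the same ``pandas`` import as the
# surrounding code): convert a small implicit-feedback DataFrame into an ``Interactions``
# object, forwarding an extra keyword argument through ``**kwargs``.
_toy_df = pd.DataFrame({
    'user_id': [0, 0, 1, 2],
    'item_id': [1, 2, 0, 2],
    'rating': [1, 1, 1, 1],
})
_toy_interactions = df_to_interactions(_toy_df,
                                       user_col='user_id',
                                       item_col='item_id',
                                       ratings_col='rating',
                                       check_num_negative_samples_is_valid=False)
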
def test_Interactions_exact_negative_samples_num_negative_samples_too_large(
    self,
    ratings_matrix_for_interactions,
):
    with pytest.raises(AssertionError):
        Interactions(mat=ratings_matrix_for_interactions,
                     max_number_of_samples_to_consider=200,
                     num_negative_samples=8)

def test_Interactions_with_missing_ids(self,
                                       df_for_interactions_with_missing_ids,
                                       ratings_matrix_for_interactions_with_missing_ids,
                                       sparse_ratings_matrix_for_interactions_with_missing_ids):
    Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)
    Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)
    Interactions(users=df_for_interactions_with_missing_ids['user_id'],
                 items=df_for_interactions_with_missing_ids['item_id'],
                 ratings=df_for_interactions_with_missing_ids['ratings'],
                 allow_missing_ids=True,
                 check_num_negative_samples_is_valid=False)

def test_duplicate_user_item_pairs(self, interactions_pandas, df_for_interactions_with_duplicates):
    duplicated_interactions = Interactions(
        users=df_for_interactions_with_duplicates['user_id'],
        items=df_for_interactions_with_duplicates['item_id'],
        check_num_negative_samples_is_valid=False,
        remove_duplicate_user_item_pairs=False,
    )
    assert duplicated_interactions.mat.getnnz() != interactions_pandas.mat.getnnz()

    non_duplicated_interactions = Interactions(
        users=df_for_interactions_with_duplicates['user_id'],
        items=df_for_interactions_with_duplicates['item_id'],
        check_num_negative_samples_is_valid=False,
        remove_duplicate_user_item_pairs=True,
    )
    assert non_duplicated_interactions.mat.getnnz() == interactions_pandas.mat.getnnz()

def test_Interactions_with_missing_ids_raises_error(
    self,
    df_for_interactions_with_missing_ids,
    ratings_matrix_for_interactions_with_missing_ids,
    sparse_ratings_matrix_for_interactions_with_missing_ids,
):
    with pytest.raises(ValueError):
        Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                     check_num_negative_samples_is_valid=False)

    with pytest.raises(ValueError):
        Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                     check_num_negative_samples_is_valid=False)

    with pytest.raises(ValueError):
        Interactions(users=df_for_interactions_with_missing_ids['user_id'],
                     items=df_for_interactions_with_missing_ids['item_id'],
                     ratings=df_for_interactions_with_missing_ids['ratings'],
                     check_num_negative_samples_is_valid=False)

def test_Interactions_approximate_negative_samples(self, ratings_matrix_for_interactions):
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=0,
                                seed=42)

    assert interactions.positive_items == {}

    for _ in range(3):
        _, negative_samples = interactions[0]
        assert len(negative_samples) == 3

def test_Interactions_approximate_negative_samples_partway_through(
    self,
    ratings_matrix_for_interactions,
):
    with pytest.warns(UserWarning):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=1,
                                    seed=42)

    assert interactions.positive_items != {}

    for _ in range(3):
        _, negative_samples = interactions[0]
        assert len(negative_samples) == 3

def test_Interactions_approximate_negative_samples_many_users(
    self,
    ratings_matrix_for_interactions,
):
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=0,
                                seed=42)

    assert interactions.positive_items == {}

    for _ in range(3):
        _, negative_samples = interactions[list(range(NUM_USERS_TO_GENERATE))]
        assert len(negative_samples) == NUM_USERS_TO_GENERATE
        for negative_sample in negative_samples:
            assert len(negative_sample) == NUM_NEGATIVE_SAMPLES

def test_Interactions_exact_negative_samples_many_users(self, ratings_matrix_for_interactions):
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=200,
                                seed=42)

    assert interactions.positive_items != {}

    for _ in range(10):
        (user_ids, _), negative_samples = interactions[list(range(NUM_USERS_TO_GENERATE))]
        assert len(negative_samples) == NUM_USERS_TO_GENERATE
        for idx, user_id in enumerate(user_ids):
            assert len(negative_samples[idx]) == NUM_NEGATIVE_SAMPLES
            for negative_sample in negative_samples[idx]:
                assert (
                    negative_sample.item()
                    not in ratings_matrix_for_interactions[user_id].nonzero()[0]
                )

def test_Interactions_exact_negative_samples(self, ratings_matrix_for_interactions):
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=200,
                                seed=42)

    assert interactions.positive_items != {}

    all_negative_samples = list()
    for _ in range(10):
        _, negative_samples = interactions[0]
        assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
        for negative_sample in negative_samples:
            assert negative_sample.item() not in ratings_matrix_for_interactions[0].nonzero()[0]

        all_negative_samples += negative_samples.tolist()

    assert len(set(all_negative_samples)) > NUM_NEGATIVE_SAMPLES

def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie/movielens/run.py --epochs 20

    Parameters
    ----------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    t = Timer()

    t.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck(' 1.0 complete')

    t.timecheck(' 2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck(' 2.0 complete')

    t.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(model=model,
                            gpus=gpus,
                            max_epochs=epochs,
                            deterministic=True,
                            logger=False,
                            checkpoint_callback=False,
                            callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
                            weights_summary='full',
                            terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n 3.0 complete')

    t.timecheck(' 4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk], test, model, k=10)
    print(f'AUC: {auc_score}')
    print(f'MRR: {mrr_score}')
    print(f'MAP@10: {mapk_score}')
    t.timecheck(' 4.0 complete')

    t.timecheck(' 5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck(' 5.0 complete')

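# Hedged usage sketch, not from the source: besides the CLI invocation shown in the docstring,
# the pipeline above can also be launched programmatically, e.g. for a short CPU-only smoke
# run. The argument values below are illustrative only.
if __name__ == '__main__':
    run_movielens_example(epochs=2, gpus=0)
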
def interactions_sparse_matrix(sparse_ratings_matrix_for_interactions):
    return Interactions(mat=sparse_ratings_matrix_for_interactions,
                        check_num_negative_samples_is_valid=False)

def interactions_pandas(df_for_interactions):
    return Interactions(users=df_for_interactions['user_id'],
                        items=df_for_interactions['item_id'],
                        ratings=df_for_interactions['ratings'],
                        check_num_negative_samples_is_valid=False)

def test_users_None(self, df_for_interactions):
    with pytest.raises(AssertionError):
        Interactions(users=None,
                     items=df_for_interactions['item_id'],
                     ratings=df_for_interactions['ratings'],
                     check_num_negative_samples_is_valid=False)

def test_ratings_None_but_its_okay(self, df_for_interactions):
    Interactions(users=df_for_interactions['user_id'],
                 items=df_for_interactions['item_id'],
                 ratings=None,
                 check_num_negative_samples_is_valid=False)