Пример #1
0
def test_Interactions_with_0_ratings(interactions_pandas,
                                     df_for_interactions_with_0_ratings):
    """
    Build ``Interactions`` from the zero-ratings DataFrame fixture and compare the result
    against ``interactions_pandas``.

    The constructor call is expected to emit a ``UserWarning``, and the object it returns must
    have the same dense array representation as ``interactions_pandas``.

    """
    source_df = df_for_interactions_with_0_ratings

    with pytest.warns(UserWarning):
        interactions_with_0s = Interactions(
            users=source_df['user_id'],
            items=source_df['item_id'],
            ratings=source_df['ratings'],
            check_num_negative_samples_is_valid=False,
        )

    expected_dense = interactions_pandas.toarray()
    actual_dense = interactions_with_0s.toarray()

    assert np.array_equal(expected_dense, actual_dense)
Пример #2
0
 def test_all_lengths_bad(self, df_for_interactions_with_missing_ids):
     """
     Instantiate ``Interactions`` from user, item, and rating columns that have each had their
     final element sliced off.

     No exception is asserted here: the test passes as long as the constructor returns.

     """
     df = df_for_interactions_with_missing_ids

     # every column is shortened by the same amount, so the three inputs stay equal in length
     trimmed_users = df['user_id'][:-1]
     trimmed_items = df['item_id'][:-1]
     trimmed_ratings = df['ratings'][:-1]

     Interactions(
         users=trimmed_users,
         items=trimmed_items,
         ratings=trimmed_ratings,
         allow_missing_ids=True,
         check_num_negative_samples_is_valid=False,
     )
Пример #3
0
def test_implicit_interactions():
    """
    Build a small ``Interactions`` object from eight hard-coded (user, item) pairs, each with a
    rating of 1.

    Returns
    -------
    interactions: Interactions
        Object built with ``check_num_negative_samples_is_valid=False``

    """
    user_ids = [0, 0, 0, 1, 1, 1, 2, 2]
    item_ids = [0, 1, 2, 1, 2, 3, 0, 2]
    # one rating of 1 per (user, item) pair
    unit_ratings = [1] * len(user_ids)

    interactions = Interactions(users=user_ids,
                                items=item_ids,
                                ratings=unit_ratings,
                                check_num_negative_samples_is_valid=False)

    return interactions
Пример #4
0
def movielens_implicit_interactions(movielens_implicit_df):
    """
    Wrap the ``user_id``, ``item_id``, and ``rating`` columns of ``movielens_implicit_df`` in an
    ``Interactions`` object.

    Parameters
    ----------
    movielens_implicit_df: DataFrame-like
        Object indexable by the column names ``'user_id'``, ``'item_id'``, and ``'rating'``

    Returns
    -------
    interactions: Interactions
        Built with 10 negative samples, ``max_number_of_samples_to_consider=200``, and missing
        IDs allowed

    """
    df = movielens_implicit_df

    interactions = Interactions(
        users=df['user_id'],
        items=df['item_id'],
        ratings=df['rating'],
        num_negative_samples=10,
        max_number_of_samples_to_consider=200,
        allow_missing_ids=True,
    )

    return interactions
Пример #5
0
def _subset_interactions(
        interactions: BaseInteractions,
        idxs: Iterable[int]) -> Union[ExplicitInteractions, Interactions]:
    """
    Build a new interactions object holding only the entries of ``interactions.mat`` found at
    positions ``idxs``.

    Parameters
    ----------
    interactions: BaseInteractions
        Source object. Its ``mat`` attribute is read through ``data``, ``row``, and ``col``, and
        its ``num_users`` / ``num_items`` set the shape of the subset
    idxs: Iterable[int]
        Positions into ``interactions.mat.data`` (and the matching ``row`` / ``col`` arrays) to
        keep. An empty selection is supported

    Returns
    -------
    subset: Union[ExplicitInteractions, Interactions]
        ``Interactions`` when ``interactions`` is an ``Interactions`` instance, otherwise
        ``ExplicitInteractions``. The shape matches the source object

    """
    idxs = np.array(idxs)

    if idxs.size == 0:
        # NumPy infers ``float64`` for an empty sequence and refuses float arrays as indices, so
        # an empty selection must be cast to an integer dtype before it is used below
        idxs = idxs.astype(int)

    coo_mat = coo_matrix(
        (interactions.mat.data[idxs],
         (interactions.mat.row[idxs], interactions.mat.col[idxs])),
        shape=(interactions.num_users, interactions.num_items))

    # disable all ``Interactions`` checks for the data splits, since we assume the initial
    # ``Interactions`` object would have these checks already applied prior to the data split
    if isinstance(interactions, Interactions):
        return Interactions(
            mat=coo_mat,
            num_negative_samples=interactions.num_negative_samples,
            allow_missing_ids=True,
            remove_duplicate_user_item_pairs=False,
            num_users=interactions.num_users,
            num_items=interactions.num_items,
            check_num_negative_samples_is_valid=False,
            max_number_of_samples_to_consider=interactions.max_number_of_samples_to_consider,
            seed=interactions.seed,
        )

    return ExplicitInteractions(
        mat=coo_mat,
        allow_missing_ids=True,
        remove_duplicate_user_item_pairs=False,
        num_users=interactions.num_users,
        num_items=interactions.num_items,
    )
Пример #6
0
def df_to_interactions(df: pd.DataFrame,
                       user_col: str = 'user_id',
                       item_col: str = 'item_id',
                       ratings_col: Optional[str] = 'rating',
                       **kwargs) -> Interactions:
    """
    Convert a DataFrame into an ``Interactions`` object.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame holding a user ID column, an item ID column, and (optionally) a ratings column
    user_col: str
        Name of the column with user IDs
    item_col: str
        Name of the column with item IDs
    ratings_col: str or None
        Name of the column with ratings. If ``None``, no column is read and ``ratings=None`` is
        handed to ``Interactions``, which will default to ratings of all 1s
    **kwargs
        Extra keyword arguments forwarded unchanged to ``Interactions``

    Returns
    -------
    interactions: collie.interactions.Interactions

    """
    if ratings_col is None:
        ratings = None
    else:
        ratings = df[ratings_col]

    users = df[user_col]
    items = df[item_col]

    return Interactions(users=users, items=items, ratings=ratings, **kwargs)
Пример #7
0
 def test_Interactions_exact_negative_samples_num_negative_samples_too_large(
     self,
     ratings_matrix_for_interactions,
 ):
     """
     Constructing ``Interactions`` from the ratings matrix fixture with
     ``num_negative_samples=8`` and ``max_number_of_samples_to_consider=200`` must raise an
     ``AssertionError``.

     """
     constructor_kwargs = dict(
         mat=ratings_matrix_for_interactions,
         max_number_of_samples_to_consider=200,
         num_negative_samples=8,
     )

     with pytest.raises(AssertionError):
         Interactions(**constructor_kwargs)
Пример #8
0
    def test_Interactions_with_missing_ids(
            self, df_for_interactions_with_missing_ids,
            ratings_matrix_for_interactions_with_missing_ids,
            sparse_ratings_matrix_for_interactions_with_missing_ids):
        """
        With ``allow_missing_ids=True``, ``Interactions`` must be constructible from each of the
        three missing-ID fixtures: the ratings matrix, the sparse ratings matrix, and the
        DataFrame columns.

        No exception is asserted; the test passes as long as every constructor call returns.

        """
        lenient_kwargs = {
            'allow_missing_ids': True,
            'check_num_negative_samples_is_valid': False,
        }

        # matrix-based construction, first from the plain then from the sparse fixture
        for matrix in (ratings_matrix_for_interactions_with_missing_ids,
                       sparse_ratings_matrix_for_interactions_with_missing_ids):
            Interactions(mat=matrix, **lenient_kwargs)

        # column-based construction
        df = df_for_interactions_with_missing_ids
        Interactions(users=df['user_id'],
                     items=df['item_id'],
                     ratings=df['ratings'],
                     **lenient_kwargs)
Пример #9
0
    def test_duplicate_user_item_pairs(self, interactions_pandas,
                                       df_for_interactions_with_duplicates):
        """
        Check the effect of ``remove_duplicate_user_item_pairs`` on the number of stored entries.

        Built from the duplicates DataFrame fixture, the ``Interactions`` object must have a
        different ``mat.getnnz()`` than ``interactions_pandas`` when duplicates are kept, and the
        same ``mat.getnnz()`` when duplicates are removed.

        """
        df = df_for_interactions_with_duplicates

        def build(remove_duplicates):
            # only the duplicate-removal flag varies between the two objects under test
            return Interactions(
                users=df['user_id'],
                items=df['item_id'],
                check_num_negative_samples_is_valid=False,
                remove_duplicate_user_item_pairs=remove_duplicates,
            )

        duplicated_interactions = build(False)
        assert duplicated_interactions.mat.getnnz() != interactions_pandas.mat.getnnz()

        non_duplicated_interactions = build(True)
        assert non_duplicated_interactions.mat.getnnz() == interactions_pandas.mat.getnnz()
Пример #10
0
    def test_Interactions_with_missing_ids_raises_error(
            self, df_for_interactions_with_missing_ids,
            ratings_matrix_for_interactions_with_missing_ids,
            sparse_ratings_matrix_for_interactions_with_missing_ids):
        """
        Without ``allow_missing_ids`` being passed, constructing ``Interactions`` from any of the
        three missing-ID fixtures (ratings matrix, sparse ratings matrix, DataFrame columns) must
        raise a ``ValueError``.

        """
        # matrix-based construction, first from the plain then from the sparse fixture
        for matrix in (ratings_matrix_for_interactions_with_missing_ids,
                       sparse_ratings_matrix_for_interactions_with_missing_ids):
            with pytest.raises(ValueError):
                Interactions(mat=matrix,
                             check_num_negative_samples_is_valid=False)

        # column-based construction
        df = df_for_interactions_with_missing_ids
        with pytest.raises(ValueError):
            Interactions(users=df['user_id'],
                         items=df['item_id'],
                         ratings=df['ratings'],
                         check_num_negative_samples_is_valid=False)
Пример #11
0
    def test_Interactions_approximate_negative_samples(
            self, ratings_matrix_for_interactions):
        """
        With ``max_number_of_samples_to_consider=0``, ``positive_items`` is expected to be empty,
        and indexing the object must still return ``NUM_NEGATIVE_SAMPLES`` negative samples.

        The first element is fetched three times to check the sample count on repeated draws.

        """
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=0,
                                    seed=42)

        assert interactions.positive_items == {}

        for _ in range(3):
            _, negative_samples = interactions[0]

            # compare against the constant handed to the constructor rather than a literal, so
            # the assertion stays correct if ``NUM_NEGATIVE_SAMPLES`` is ever changed
            assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
Пример #12
0
    def test_Interactions_approximate_negative_samples_partway_through(
        self,
        ratings_matrix_for_interactions,
    ):
        """
        With ``max_number_of_samples_to_consider=1``, construction is expected to emit a
        ``UserWarning`` and leave ``positive_items`` non-empty, and indexing the object must still
        return ``NUM_NEGATIVE_SAMPLES`` negative samples.

        The first element is fetched three times to check the sample count on repeated draws.

        """
        with pytest.warns(UserWarning):
            interactions = Interactions(
                mat=ratings_matrix_for_interactions,
                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                max_number_of_samples_to_consider=1,
                seed=42)

        assert interactions.positive_items != {}

        for _ in range(3):
            _, negative_samples = interactions[0]

            # compare against the constant handed to the constructor rather than a literal, so
            # the assertion stays correct if ``NUM_NEGATIVE_SAMPLES`` is ever changed
            assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
Пример #13
0
    def test_Interactions_approximate_negative_samples_many_users(
        self,
        ratings_matrix_for_interactions,
    ):
        """
        With ``max_number_of_samples_to_consider=0``, ``positive_items`` is expected to be empty.
        Indexing the object with ``NUM_USERS_TO_GENERATE`` positions at once must return one
        group of ``NUM_NEGATIVE_SAMPLES`` negative samples per requested position.

        The batch is fetched three times to check the counts on repeated draws.

        """
        interactions = Interactions(
            mat=ratings_matrix_for_interactions,
            num_negative_samples=NUM_NEGATIVE_SAMPLES,
            max_number_of_samples_to_consider=0,
            seed=42,
        )

        assert interactions.positive_items == {}

        for _ in range(3):
            batch_positions = list(range(NUM_USERS_TO_GENERATE))
            _, negative_samples = interactions[batch_positions]

            assert len(negative_samples) == NUM_USERS_TO_GENERATE

            for samples_for_position in negative_samples:
                assert len(samples_for_position) == NUM_NEGATIVE_SAMPLES
Пример #14
0
    def test_Interactions_exact_negative_samples_many_users(
            self, ratings_matrix_for_interactions):
        """
        With ``max_number_of_samples_to_consider=200``, ``positive_items`` is expected to be
        non-empty. Indexing the object with ``NUM_USERS_TO_GENERATE`` positions at once must
        return ``NUM_NEGATIVE_SAMPLES`` negative samples per position, none of which appears in
        ``ratings_matrix_for_interactions[user_id].nonzero()[0]`` for the matching user ID.

        The batch is fetched ten times to check the property on repeated draws.

        """
        interactions = Interactions(
            mat=ratings_matrix_for_interactions,
            num_negative_samples=NUM_NEGATIVE_SAMPLES,
            max_number_of_samples_to_consider=200,
            seed=42,
        )

        assert interactions.positive_items != {}

        for _ in range(10):
            batch_positions = list(range(NUM_USERS_TO_GENERATE))
            (user_ids, _), negative_samples = interactions[batch_positions]

            assert len(negative_samples) == NUM_USERS_TO_GENERATE

            for idx, user_id in enumerate(user_ids):
                samples_for_user = negative_samples[idx]
                assert len(samples_for_user) == NUM_NEGATIVE_SAMPLES

                # nonzero entries of this user's slice of the ratings matrix
                interacted_items = ratings_matrix_for_interactions[user_id].nonzero()[0]

                for negative_sample in samples_for_user:
                    assert negative_sample.item() not in interacted_items
Пример #15
0
    def test_Interactions_exact_negative_samples(
            self, ratings_matrix_for_interactions):
        """
        With ``max_number_of_samples_to_consider=200``, ``positive_items`` is expected to be
        non-empty. Each fetch of the first element must return ``NUM_NEGATIVE_SAMPLES`` negative
        samples, none of which appears in ``ratings_matrix_for_interactions[0].nonzero()[0]``.

        Across ten fetches, more than ``NUM_NEGATIVE_SAMPLES`` distinct negative samples must be
        seen, i.e. the draws are not all the same set.

        """
        interactions = Interactions(
            mat=ratings_matrix_for_interactions,
            num_negative_samples=NUM_NEGATIVE_SAMPLES,
            max_number_of_samples_to_consider=200,
            seed=42,
        )

        assert interactions.positive_items != {}

        # nonzero entries of the first slice of the ratings matrix
        interacted_items = ratings_matrix_for_interactions[0].nonzero()[0]

        all_negative_samples = []
        for _ in range(10):
            _, negative_samples = interactions[0]

            assert len(negative_samples) == NUM_NEGATIVE_SAMPLES

            for negative_sample in negative_samples:
                assert negative_sample.item() not in interacted_items

            all_negative_samples.extend(negative_samples.tolist())

        distinct_negative_samples = set(all_negative_samples)
        assert len(distinct_negative_samples) > NUM_NEGATIVE_SAMPLES
Пример #16
0
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Run the MovieLens example end to end: load the data, split it, train a matrix factorization
    model, evaluate it, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie/movielens/run.py  --epochs 20

    Parameters
    ----------
    epochs: int
        Maximum number of training epochs, passed to the trainer as ``max_epochs``
    gpus: int
        Number of gpus to train on, passed to the trainer as ``gpus``

    """
    timer = Timer()

    # step 1: load the raw ratings
    timer.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    ratings_df = read_movielens_df(decrement_ids=True)
    timer.timecheck('  1.0 complete')

    # step 2: convert to implicit data, then split into train / validation / test
    timer.timecheck('  2.0 - splitting data')
    implicit_df = convert_to_implicit(ratings_df)
    interactions = Interactions(
        users=implicit_df['user_id'],
        items=implicit_df['item_id'],
        allow_missing_ids=True,
    )
    train_split, val_split, test_split = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train_split, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val_split, batch_size=1024, shuffle=False)
    timer.timecheck('  2.0 complete')

    # step 3: fit the model, stopping early on the monitored validation loss
    timer.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(
        train=train_loader,
        val=val_loader,
        dropout_p=0.05,
        loss='adaptive',
        lr=5e-2,
        embedding_dim=10,
        optimizer='adam',
        weight_decay=1e-7,
    )
    early_stopping = EarlyStopping(monitor='val_loss_epoch', mode='min')
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[early_stopping],
        weights_summary='full',
        terminate_on_nan=True,
    )
    trainer.fit(model)
    model.eval()
    timer.timecheck('\n  3.0 complete')

    # step 4: score the fitted model on the held-out test split
    timer.timecheck('  4.0 - evaluating model')
    metrics = [auc, mrr, mapk]
    auc_score, mrr_score, mapk_score = evaluate_in_batches(metrics, test_split, model, k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    timer.timecheck('  4.0 complete')

    # step 5: save the model under ``DATA_PATH``
    timer.timecheck('  5.0 - saving model')
    model.save_model(DATA_PATH / 'fitted_model')
    timer.timecheck('  5.0 complete')
Пример #17
0
def interactions_sparse_matrix(sparse_ratings_matrix_for_interactions):
    """
    Wrap the sparse ratings matrix fixture in an ``Interactions`` object, with the
    ``check_num_negative_samples_is_valid`` check turned off.

    """
    interactions = Interactions(
        mat=sparse_ratings_matrix_for_interactions,
        check_num_negative_samples_is_valid=False,
    )

    return interactions
Пример #18
0
def interactions_pandas(df_for_interactions):
    """
    Build an ``Interactions`` object from the ``user_id``, ``item_id``, and ``ratings`` columns
    of ``df_for_interactions``, with the ``check_num_negative_samples_is_valid`` check turned off.

    """
    df = df_for_interactions

    interactions = Interactions(
        users=df['user_id'],
        items=df['item_id'],
        ratings=df['ratings'],
        check_num_negative_samples_is_valid=False,
    )

    return interactions
Пример #19
0
 def test_users_None(self, df_for_interactions):
     """
     Passing ``users=None`` alongside real item and rating columns must raise an
     ``AssertionError``.

     """
     df = df_for_interactions

     with pytest.raises(AssertionError):
         Interactions(
             users=None,
             items=df['item_id'],
             ratings=df['ratings'],
             check_num_negative_samples_is_valid=False,
         )
Пример #20
0
 def test_ratings_None_but_its_okay(self, df_for_interactions):
     """
     Passing ``ratings=None`` alongside real user and item columns is accepted.

     No exception is asserted; the test passes as long as the constructor returns.

     """
     df = df_for_interactions

     Interactions(
         users=df['user_id'],
         items=df['item_id'],
         ratings=None,
         check_num_negative_samples_is_valid=False,
     )