Example #1
def test_Interactions_with_0_ratings(interactions_pandas, df_for_interactions_with_0_ratings):
    with pytest.warns(UserWarning):
        interactions_with_0s = Interactions(users=df_for_interactions_with_0_ratings['user_id'],
                                            items=df_for_interactions_with_0_ratings['item_id'],
                                            ratings=df_for_interactions_with_0_ratings['ratings'],
                                            check_num_negative_samples_is_valid=False)

    assert np.array_equal(interactions_pandas.toarray(), interactions_with_0s.toarray())
Example #2
    def test_items_length_bad(self, df_for_interactions_with_missing_ids):
        with pytest.raises(ValueError):
            Interactions(users=df_for_interactions_with_missing_ids['user_id'],
                         items=df_for_interactions_with_missing_ids['item_id'][:-1],
                         ratings=df_for_interactions_with_missing_ids['ratings'],
                         allow_missing_ids=True,
                         check_num_negative_samples_is_valid=False)
Example #3
def movielens_implicit_interactions(movielens_implicit_df):
    return Interactions(users=movielens_implicit_df['user_id'],
                        items=movielens_implicit_df['item_id'],
                        ratings=movielens_implicit_df['rating'],
                        num_negative_samples=10,
                        max_number_of_samples_to_consider=200,
                        allow_missing_ids=True)
Example #4
def test_implicit_interactions():
    return Interactions(
        users=[0, 0, 0, 1, 1, 1, 2, 2],
        items=[0, 1, 2, 1, 2, 3, 0, 2],
        ratings=[1, 1, 1, 1, 1, 1, 1, 1],
        check_num_negative_samples_is_valid=False,
    )
Example #5
def df_to_interactions(df: pd.DataFrame,
                       user_col: str = 'user_id',
                       item_col: str = 'item_id',
                       ratings_col: Optional[str] = 'rating',
                       **kwargs) -> Interactions:
    """
    Helper function to convert a DataFrame to an ``Interactions`` object.

    Parameters
    -------------
    df: pd.DataFrame
        DataFrame with columns for user IDs, item IDs, and (optionally) ratings
    user_col: str
        Column name for the user IDs
    item_col: str
        Column name for the item IDs
    ratings_col: str or None
        Column name for the ratings column. If ``None``, ratings will default to all 1s
    **kwargs
        Keyword arguments to pass to ``Interactions``

    Returns
    -------------
    interactions: collie_recs.interactions.Interactions

    """
    ratings = df[ratings_col] if ratings_col is not None else None

    return Interactions(users=df[user_col],
                        items=df[item_col],
                        ratings=ratings,
                        **kwargs)
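A minimal usage sketch for ``df_to_interactions``; the toy DataFrame and the forwarded keyword arguments below are illustrative assumptions, not taken from the source.

import pandas as pd

# hypothetical toy DataFrame using the helper's default column names
toy_df = pd.DataFrame({
    'user_id': [0, 0, 1, 2],
    'item_id': [1, 2, 2, 0],
    'rating': [5, 3, 4, 1],
})

# any extra keyword arguments are forwarded straight to ``Interactions``
toy_interactions = df_to_interactions(toy_df,
                                      allow_missing_ids=True,
                                      check_num_negative_samples_is_valid=False)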
Example #6
    def test_duplicate_user_item_pairs(self,
                                       interactions_pandas,
                                       df_for_interactions_with_duplicates):
        duplicated_interactions = Interactions(users=df_for_interactions_with_duplicates['user_id'],
                                               items=df_for_interactions_with_duplicates['item_id'],
                                               check_num_negative_samples_is_valid=False)

        assert duplicated_interactions.mat.getnnz() == interactions_pandas.mat.getnnz()
Example #7
    def test_Interactions_exact_negative_samples_num_negative_samples_too_large(
        self,
        ratings_matrix_for_interactions,
    ):
        with pytest.raises(AssertionError):
            Interactions(mat=ratings_matrix_for_interactions,
                         max_number_of_samples_to_consider=200,
                         num_negative_samples=8)
Example #8
    def test_Interactions_with_missing_ids_raises_error(
        self,
        df_for_interactions_with_missing_ids,
        ratings_matrix_for_interactions_with_missing_ids,
        sparse_ratings_matrix_for_interactions_with_missing_ids
    ):
        with pytest.raises(ValueError):
            Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                         check_num_negative_samples_is_valid=False)

        with pytest.raises(ValueError):
            Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                         check_num_negative_samples_is_valid=False)

        with pytest.raises(ValueError):
            Interactions(users=df_for_interactions_with_missing_ids['user_id'],
                         items=df_for_interactions_with_missing_ids['item_id'],
                         ratings=df_for_interactions_with_missing_ids['ratings'],
                         check_num_negative_samples_is_valid=False)
Example #9
    def test_Interactions_with_missing_ids(
        self,
        df_for_interactions_with_missing_ids,
        ratings_matrix_for_interactions_with_missing_ids,
        sparse_ratings_matrix_for_interactions_with_missing_ids
    ):
        Interactions(mat=ratings_matrix_for_interactions_with_missing_ids,
                     allow_missing_ids=True,
                     check_num_negative_samples_is_valid=False)

        Interactions(mat=sparse_ratings_matrix_for_interactions_with_missing_ids,
                     allow_missing_ids=True,
                     check_num_negative_samples_is_valid=False)

        Interactions(users=df_for_interactions_with_missing_ids['user_id'],
                     items=df_for_interactions_with_missing_ids['item_id'],
                     ratings=df_for_interactions_with_missing_ids['ratings'],
                     allow_missing_ids=True,
                     check_num_negative_samples_is_valid=False)
Example #10
    def test_Interactions_approximate_negative_samples(self, ratings_matrix_for_interactions):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=0,
                                    seed=42)

        assert interactions.positive_items == {}

        for _ in range(3):
            _, negative_samples = interactions[0]

            assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
Example #11
def test_Interactions(ratings_matrix_for_interactions,
                      sparse_ratings_matrix_for_interactions,
                      df_for_interactions):
    interactions_1 = Interactions(mat=ratings_matrix_for_interactions,
                                  check_num_negative_samples_is_valid=False)
    interactions_2 = Interactions(mat=sparse_ratings_matrix_for_interactions,
                                  check_num_negative_samples_is_valid=False)
    interactions_3 = Interactions(users=df_for_interactions['user_id'],
                                  items=df_for_interactions['item_id'],
                                  ratings=df_for_interactions['rating'],
                                  check_num_negative_samples_is_valid=False)

    np.testing.assert_equal(interactions_1.toarray(), interactions_2.toarray())
    np.testing.assert_equal(interactions_1.toarray(), interactions_3.toarray())
    assert interactions_1.num_users == interactions_2.num_users == interactions_3.num_users
    assert interactions_1.num_items == interactions_2.num_items == interactions_3.num_items
    assert (interactions_1.num_interactions == interactions_2.num_interactions
            == interactions_3.num_interactions)
Example #12
def test_Interactions_approximate_negative_samples_partway_through(
        ratings_matrix_for_interactions):
    with pytest.warns(UserWarning):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=1,
                                    seed=42)

    assert interactions.positive_items != {}

    for _ in range(3):
        _, negative_samples = interactions[0]

        assert len(negative_samples) == NUM_NEGATIVE_SAMPLES
Example #13
def interactions_to_split_with_a_user_with_only_one_interaction():
    df = pd.DataFrame(
        data={
            'user_id':
            [0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4],
            'item_id':
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 2, 4, 5],
            'rating':
            [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 2, 3, 5, 4]
        })

    return Interactions(users=df['user_id'],
                        items=df['item_id'],
                        ratings=df['rating'],
                        check_num_negative_samples_is_valid=False)
Example #14
def test_Interactions_approximate_negative_samples_many_users(
        ratings_matrix_for_interactions):
    interactions = Interactions(mat=ratings_matrix_for_interactions,
                                num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                max_number_of_samples_to_consider=0,
                                seed=42)

    assert interactions.positive_items == {}

    for _ in range(3):
        _, negative_samples = interactions[list(range(NUM_USERS_TO_GENERATE))]

        assert len(negative_samples) == NUM_USERS_TO_GENERATE

        for negative_sample in negative_samples:
            assert len(negative_sample) == NUM_NEGATIVE_SAMPLES
Example #15
def _subset_interactions(interactions: Interactions, idxs: Iterable[int]) -> Interactions:
    idxs = np.array(idxs)

    coo_mat = coo_matrix(
        (interactions.mat.data[idxs], (interactions.mat.row[idxs],
                                       interactions.mat.col[idxs])),
        shape=(interactions.num_users, interactions.num_items)
    )

    return Interactions(
        mat=coo_mat,
        num_negative_samples=interactions.num_negative_samples,
        allow_missing_ids=True,
        num_users=interactions.num_users,
        num_items=interactions.num_items,
        check_num_negative_samples_is_valid=False,
        max_number_of_samples_to_consider=interactions.max_number_of_samples_to_consider,
        seed=interactions.seed,
    )
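A hedged sketch of how ``_subset_interactions`` might be used to carve one ``Interactions`` object into random train/test subsets; the 80/20 split, the seed, and the variable names are assumptions, not the library's own splitting code.

import numpy as np

# hypothetical: ``interactions`` is an existing ``Interactions`` object
rng = np.random.default_rng(seed=42)
shuffled_idxs = rng.permutation(interactions.num_interactions)
cutoff = int(0.8 * len(shuffled_idxs))

train = _subset_interactions(interactions, idxs=shuffled_idxs[:cutoff])
test = _subset_interactions(interactions, idxs=shuffled_idxs[cutoff:])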
Example #16
    def test_Interactions_exact_negative_samples(self, ratings_matrix_for_interactions):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=200,
                                    seed=42)

        assert interactions.positive_items != {}

        all_negative_samples = list()
        for _ in range(10):
            _, negative_samples = interactions[0]

            assert len(negative_samples) == NUM_NEGATIVE_SAMPLES

            for negative_sample in negative_samples:
                assert negative_sample.item() not in ratings_matrix_for_interactions[0].nonzero()[0]

            all_negative_samples += negative_samples.tolist()

        assert len(set(all_negative_samples)) > NUM_NEGATIVE_SAMPLES
Example #17
    def test_Interactions_exact_negative_samples_many_users(self, ratings_matrix_for_interactions):
        interactions = Interactions(mat=ratings_matrix_for_interactions,
                                    num_negative_samples=NUM_NEGATIVE_SAMPLES,
                                    max_number_of_samples_to_consider=200,
                                    seed=42)

        assert interactions.positive_items != {}

        for _ in range(10):
            (user_ids, _), negative_samples = interactions[list(range(NUM_USERS_TO_GENERATE))]

            assert len(negative_samples) == NUM_USERS_TO_GENERATE

            for idx, user_id in enumerate(user_ids):
                assert len(negative_samples[idx]) == NUM_NEGATIVE_SAMPLES

                for negative_sample in negative_samples[idx]:
                    assert (
                        negative_sample.item()
                        not in ratings_matrix_for_interactions[user_id].nonzero()[0]
                    )
Example #18
def test_stratified_split(interactions_to_split):
    train_expected_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
            'item_id': [1, 2, 3, 4, 6, 8, 1, 2, 3, 4, 2, 4, 5],
            'rating': [2, 3, 4, 5, 3, 1, 1, 2, 4, 5, 5, 5, 4],
        })
    train_expected = Interactions(
        mat=coo_matrix(
            (
                train_expected_df['rating'],
                (train_expected_df['user_id'], train_expected_df['item_id']),
            ),
            shape=(interactions_to_split.num_users,
                   interactions_to_split.num_items),
        ),
        allow_missing_ids=True,
        check_num_negative_samples_is_valid=False,
    )

    validate_expected_df = pd.DataFrame(
        data={
            'user_id': [0, 1, 2, 3, 4],
            'item_id': [7, 3, 2, 1, 2],
            'rating': [2, 3, 3, 1, 3],
        })
    validate_expected = Interactions(
        mat=coo_matrix(
            (
                validate_expected_df['rating'],
                (validate_expected_df['user_id'],
                 validate_expected_df['item_id']),
            ),
            shape=(interactions_to_split.num_users,
                   interactions_to_split.num_items),
        ),
        allow_missing_ids=True,
        check_num_negative_samples_is_valid=False,
    )

    test_expected_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 1, 2, 3, 4],
            'item_id': [0, 5, 4, 1, 4, 1],
            'rating': [1, 4, 4, 2, 4, 2],
        })
    test_expected = Interactions(
        mat=coo_matrix(
            (
                test_expected_df['rating'],
                (test_expected_df['user_id'], test_expected_df['item_id']),
            ),
            shape=(interactions_to_split.num_users,
                   interactions_to_split.num_items),
        ),
        allow_missing_ids=True,
        check_num_negative_samples_is_valid=False,
    )

    (train_actual, validate_actual,
     test_actual) = stratified_split(interactions=interactions_to_split,
                                     val_p=0.1,
                                     test_p=0.2,
                                     seed=46)

    np.testing.assert_array_equal(train_actual.toarray(),
                                  train_expected.toarray())
    np.testing.assert_array_equal(validate_actual.toarray(),
                                  validate_expected.toarray())
    np.testing.assert_array_equal(test_actual.toarray(),
                                  test_expected.toarray())

    assert (train_actual.num_users == train_expected.num_users ==
            validate_actual.num_users == validate_expected.num_users ==
            test_actual.num_users == test_expected.num_users)

    assert (train_actual.num_items == train_expected.num_items ==
            validate_actual.num_items == validate_expected.num_items ==
            test_actual.num_items == test_expected.num_items)
Example #19
def interactions_sparse_matrix(sparse_ratings_matrix_for_interactions):
    return Interactions(mat=sparse_ratings_matrix_for_interactions,
                        check_num_negative_samples_is_valid=False)
Example #20
def interactions_pandas(df_for_interactions):
    return Interactions(users=df_for_interactions['user_id'],
                        items=df_for_interactions['item_id'],
                        ratings=df_for_interactions['ratings'],
                        check_num_negative_samples_is_valid=False)
Example #21
    def test_ratings_None_but_its_okay(self, df_for_interactions):
        Interactions(users=df_for_interactions['user_id'],
                     items=df_for_interactions['item_id'],
                     ratings=None,
                     check_num_negative_samples_is_valid=False)
Example #22
    def test_users_None(self, df_for_interactions):
        with pytest.raises(AssertionError):
            Interactions(users=None,
                         items=df_for_interactions['item_id'],
                         ratings=df_for_interactions['ratings'],
                         check_num_negative_samples_is_valid=False)
Example #23
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie_recs/movielens/run.py --epochs 20

    Parameters
    -------------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of GPUs to train on

    """
    t = Timer()

    t.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck('  1.0 complete')

    t.timecheck('  2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck('  2.0 complete')

    t.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(
        model=model,
        gpus=gpus,
        max_epochs=epochs,
        deterministic=True,
        logger=False,
        checkpoint_callback=False,
        callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
        weights_summary='full',
        terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n  3.0 complete')

    t.timecheck('  4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk],
                                                           test,
                                                           model,
                                                           k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    t.timecheck('  4.0 complete')

    t.timecheck('  5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck('  5.0 complete')
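A minimal sketch of a command-line entry point that would support the ``python collie_recs/movielens/run.py --epochs 20`` invocation mentioned in the docstring; the argparse wiring below is an assumption, since the actual script may expose its arguments differently.

import argparse

if __name__ == '__main__':
    # hypothetical CLI wiring around ``run_movielens_example``
    parser = argparse.ArgumentParser(description='Train and evaluate a MovieLens 100K model.')
    parser.add_argument('--epochs', type=int, default=20, help='number of epochs for model training')
    parser.add_argument('--gpus', type=int, default=0, help='number of GPUs to train on')
    args = parser.parse_args()

    run_movielens_example(epochs=args.epochs, gpus=args.gpus)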