# ===== Example 1 =====
    def test_b_1_lfm_hybrid(self):
        """Fit a hybrid LightFM model (CF + external item features) and run the shared recommender checks."""
        self._setup_obs_handler()

        from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

        train_obs = self.state.train_obs
        item_features = train_obs.get_item_features()
        recommender = LightFMRecommender(
            external_features=item_features, no_components=50)
        recommender.fit(train_obs, epochs=20)
        self._test_recommender(recommender)
# ===== Example 2 =====
    def test_b_1_lfm_recommender(self):
        """Fit a plain CF LightFM model, verify recorded fit params, and stash the model on shared state."""
        self._setup_obs_handler()

        from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

        recommender = LightFMRecommender()
        recommender.fit(self.state.train_obs, epochs=20)
        # fit() should record the epochs value it was called with
        self.assertEqual(recommender.fit_params['epochs'], 20)
        self._test_recommender(recommender)
        # kept on shared state so downstream tests can reuse the trained model
        self.state.lfm_rec = recommender
"""
This is an example on the MovieLens-1M dataset demonstrating:
    - more advanced fitting features: fit, evaluation, early stopping, and hyper-parameter search
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# download / prepare the dataset files and load the interactions
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# wrap the interactions in an observations handler and hold out 20% for testing
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)
lfm_rec = LightFMRecommender()

# train LightFM with early stopping and print evaluation results
# (presumably holds out valid_ratio of the training data to monitor the
#  'n-MRR@10' metric, then refits on all training data — TODO confirm
#  against fit_with_early_stop's documentation)
lfm_rec.fit_with_early_stop(train_obs,
                            epochs_max=30,
                            epochs_step=1,
                            stop_patience=1,
                            valid_ratio=0.2,
                            metric='n-MRR@10',
                            refit_on_all=True)
print(
    lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop '))

# perform a hyperparameter search on LightFM recommender
space = lfm_rec.guess_search_space()
# NOTE(review): this call is truncated in this excerpt — its arguments are missing
hp_results = lfm_rec.hyper_param_search(
# ===== Example 4 =====
# End-to-end example: prepare MovieLens data, train LightFM, evaluate,
# then produce recommendations and item similarities.

import pandas as pd

from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# download / prepare the dataset files and load the interactions
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# build an observations handler and hold out 20% of interactions for testing
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# train a plain collaborative-filtering LightFM recommender
lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)

# summary ranking-evaluation report on the held-out interactions
report = lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm ', n_rec=100)
print(report)

# recommendations for every user (training interactions are filtered out by default)
recs = lfm_rec.get_recommendations(lfm_rec.all_users, n_rec=5)
print(recs.sample(5))

# item-to-item similarities for every item
simils = lfm_rec.get_similar_items(lfm_rec.all_items, n_simil=5)
print(simils.sample(10))
    # by item popularity- popular and unpopular items
    item_hist_counts = train_df.itemid.value_counts()
    item_hist_counts.hist(bins=100, alpha=0.5)
    popular_items = item_hist_counts[item_hist_counts >= 1000].index.tolist()
    test_df_pop_movies = test_df[test_df.itemid.isin(popular_items)]
    test_df_nonpop_movies = test_df[~test_df.itemid.isin(popular_items)]

    test_dfs = [
        test_df, test_df_act_us, test_df_nonact_us, test_df_pop_movies,
        test_df_nonpop_movies
    ]
    test_names = [
        'all ', 'active users ', 'inactive users ', 'popular movies ',
        'unpopular movies '
    ]
    df_lens = [len(t) for t in test_dfs]
    print('Test DFs counts: ' + str(list(zip(test_names, df_lens))))
    return test_dfs, test_names


# Build the segmented test sets (all / active / inactive users,
# popular / unpopular movies) defined above.
test_dfs, test_names = construct_multiple_test_sets(
    test_df=test_obs.df_obs, train_df=train_obs.df_obs)

# Fit a plain LightFM model and report ranking metrics per segment.
lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)
segment_report = lfm_rec.eval_on_test_by_ranking(
    test_dfs=test_dfs, test_names=test_names, prefix='lfm regular ')
print(segment_report)
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# download / prepare the dataset files and load the interactions
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# wrap the interactions in an observations handler and hold out 20% for testing
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# train and test LightFM recommender
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)

# train and evaluate a Cooccurrence recommender (fast and deterministic)
from ml_recsys_tools.recommenders.cooccurrence_recommenders import ItemCoocRecommender

item_cooc_rec = ItemCoocRecommender()
item_cooc_rec.fit(train_obs)
# NOTE(review): the handler object (test_obs) is passed here, whereas sibling
# examples pass test_obs.df_obs — verify eval_on_test_by_ranking accepts both
print(
    item_cooc_rec.eval_on_test_by_ranking(test_obs,
                                          prefix='item cooccurrence '))

# combine LightFM and Cooccurrence recommenders
# using recommendation ranks, and evaluate
# NOTE(review): the code using CombinedRankEnsemble is truncated in this excerpt
from ml_recsys_tools.recommenders.combination_ensembles import CombinedRankEnsemble
# read the interactions dataframe, create a data handler object, and split to train and test
# NOTE(review): rating_csv_path / movies_csv_path are presumably produced by an
# earlier get_and_prep_data() call that is truncated out of this excerpt — confirm
import pandas as pd
ratings_df = pd.read_csv(rating_csv_path)
movies_df = pd.read_csv(movies_csv_path)

from ml_recsys_tools.data_handlers.interactions_with_features import ObsWithFeatures

# handler that carries item (movie) features alongside the interactions
obs = ObsWithFeatures(df_obs=ratings_df, df_items=movies_df,
                      uid_col='userid', iid_col='itemid', item_id_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# compare LightFM recommenders
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# no features - just CF
cf_only = LightFMRecommender()
cf_only.fit(train_obs, epochs=20)
print(cf_only.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm ', n_rec=100))


# using movie genres and CF (hybrid mode) - slightly better
# feature columns = every movies_df column except the item-id column
feature_columns = list(movies_df.columns.difference(['itemid']))
hybrid = LightFMRecommender(external_features=train_obs.get_item_features(bin_cols=feature_columns))
hybrid.fit(train_obs, epochs=20)
print(hybrid.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm hybrid ', n_rec=100))


# using only genres - much worse than both - but still better than chance
# (add_identity_mat=False presumably drops the per-item identity features,
#  leaving genres only — TODO confirm against external_features_params docs)
feature_columns = list(movies_df.columns.difference(['item_ind', 'itemid']))
# NOTE(review): the fit/evaluation of only_feat is truncated in this excerpt
only_feat = LightFMRecommender(external_features=train_obs.get_item_features(bin_cols=feature_columns),
                            external_features_params=dict(add_identity_mat=False))
# ===== Example 8 =====
Example explaining the peculiarities of evaluation
"""

# Evaluation example: fit a plain LightFM model, then score it with the
# exact (full-rank) ranking evaluation.

import pandas as pd

from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# download / prepare the dataset files and load the interactions
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# build an observations handler and hold out 20% of interactions for testing
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# train and test LightFM recommender
lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)

# print evaluation results:
# for LightFM there is an exact method that on large and sparse
# data might be too slow (for this data it's much faster though)
exact_report = lfm_rec.eval_on_test_by_ranking_exact(
    test_obs.df_obs, prefix='lfm regular exact ')
print(exact_report)

# this ranking evaluation is done by sampling the top n_rec recommendations
# rather than computing ranks for all items (which is very slow and
# memory-expensive for large data). this way the evaluation is mostly
# accurate for the top results, but quite pessimistic for any non-@k metric
# (especially AUC, which scores all ranks). choosing higher values for
# n_rec makes the evaluation more accurate (less pessimistic).