def test_b_1_lfm_hybrid(self):
    """Fit a hybrid LightFM model (collaborative + external item features) and run the shared recommender checks."""
    self._setup_obs_handler()
    from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

    # hybrid mode: item features from the observation handler on top of 50 latent components
    item_feats = self.state.train_obs.get_item_features()
    recommender = LightFMRecommender(external_features=item_feats, no_components=50)
    recommender.fit(self.state.train_obs, epochs=20)
    self._test_recommender(recommender)
def test_b_1_lfm_recommender(self):
    """Fit a plain collaborative-filtering LightFM model, verify fit params, and stash it on shared state."""
    self._setup_obs_handler()
    from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

    recommender = LightFMRecommender()
    recommender.fit(self.state.train_obs, epochs=20)
    # fit() should record the epochs argument it was called with
    self.assertEqual(recommender.fit_params['epochs'], 20)
    self._test_recommender(recommender)
    # self._test_predict_for_user(recommender)  # kept disabled, as in the original
    self.state.lfm_rec = recommender
""" This is an example on datasets-1M demonstrating: - More advanced fitting features: fit, evaluation, early stopping, hyper-param search """ from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data import pandas as pd from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data() ratings_df = pd.read_csv(rating_csv_path) obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid') train_obs, test_obs = obs.split_train_test(ratio=0.2) lfm_rec = LightFMRecommender() # train LightFM with early stopping and print evaluation results lfm_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=1, stop_patience=1, valid_ratio=0.2, metric='n-MRR@10', refit_on_all=True) print( lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop ')) # perform a hyperparameter search on LightFM recommender space = lfm_rec.guess_search_space() hp_results = lfm_rec.hyper_param_search(
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# download / prepare the MovieLens data and get the CSV paths
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()

# read the interactions dataframe and create a data handler object and split to train and test
ratings_df = pd.read_csv(rating_csv_path)
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# train and test LightFM recommender
lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)

# print summary evaluation report:
report = lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm ', n_rec=100)
print(report)

# get all recommendations and print a sample
# (training interactions are filtered out by default)
recs = lfm_rec.get_recommendations(lfm_rec.all_users, n_rec=5)
print(recs.sample(5))

# get all similarities and print a sample
simils = lfm_rec.get_similar_items(lfm_rec.all_items, n_simil=5)
print(simils.sample(10))
# by item popularity- popular and unpopular items item_hist_counts = train_df.itemid.value_counts() item_hist_counts.hist(bins=100, alpha=0.5) popular_items = item_hist_counts[item_hist_counts >= 1000].index.tolist() test_df_pop_movies = test_df[test_df.itemid.isin(popular_items)] test_df_nonpop_movies = test_df[~test_df.itemid.isin(popular_items)] test_dfs = [ test_df, test_df_act_us, test_df_nonact_us, test_df_pop_movies, test_df_nonpop_movies ] test_names = [ 'all ', 'active users ', 'inactive users ', 'popular movies ', 'unpopular movies ' ] df_lens = [len(t) for t in test_dfs] print('Test DFs counts: ' + str(list(zip(test_names, df_lens)))) return test_dfs, test_names test_dfs, test_names = construct_multiple_test_sets(test_df=test_obs.df_obs, train_df=train_obs.df_obs) # evaluation lfm_rec = LightFMRecommender() lfm_rec.fit(train_obs, epochs=10) print( lfm_rec.eval_on_test_by_ranking(test_dfs=test_dfs, test_names=test_names, prefix='lfm regular '))
""" from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data import pandas as pd from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data() ratings_df = pd.read_csv(rating_csv_path) obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid') train_obs, test_obs = obs.split_train_test(ratio=0.2) # train and test LightFM recommender from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender lfm_rec = LightFMRecommender() lfm_rec.fit(train_obs, epochs=10) # train and evaluate a Cooccurrence recommender (fast and deterministic) from ml_recsys_tools.recommenders.cooccurrence_recommenders import ItemCoocRecommender item_cooc_rec = ItemCoocRecommender() item_cooc_rec.fit(train_obs) print( item_cooc_rec.eval_on_test_by_ranking(test_obs, prefix='item cooccurrence ')) # combine LightFM and Cooccurrence recommenders # using recommendation ranks, and evaluate from ml_recsys_tools.recommenders.combination_ensembles import CombinedRankEnsemble
# read the interactions dataframe and create a data handler object and split to train and test
import pandas as pd
from ml_recsys_tools.data_handlers.interactions_with_features import ObsWithFeatures
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

ratings_df = pd.read_csv(rating_csv_path)
movies_df = pd.read_csv(movies_csv_path)
obs = ObsWithFeatures(df_obs=ratings_df, df_items=movies_df,
                      uid_col='userid', iid_col='itemid', item_id_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# compare LightFM recommenders

# no features - just CF
cf_only = LightFMRecommender()
cf_only.fit(train_obs, epochs=20)
print(cf_only.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm ', n_rec=100))

# using movie genres and CF (hybrid mode) - slightly better
feature_columns = list(movies_df.columns.difference(['itemid']))
genre_features = train_obs.get_item_features(bin_cols=feature_columns)
hybrid = LightFMRecommender(external_features=genre_features)
hybrid.fit(train_obs, epochs=20)
print(hybrid.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm hybrid ', n_rec=100))

# using only genres - much worse than both - but still better than chance
feature_columns = list(movies_df.columns.difference(['item_ind', 'itemid']))
only_feat = LightFMRecommender(
    external_features=train_obs.get_item_features(bin_cols=feature_columns),
    external_features_params=dict(add_identity_mat=False))
Example explaining the peculiarities of evaluation
"""
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# load the MovieLens interactions into a handler and split train / test
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# train and test LightFM recommender
lfm_rec = LightFMRecommender()
lfm_rec.fit(train_obs, epochs=10)

# print evaluation results:
# for LightFM there is an exact method that on large and sparse
# data might be too slow (for this data it's much faster though)
print(
    lfm_rec.eval_on_test_by_ranking_exact(test_obs.df_obs,
                                          prefix='lfm regular exact '))

# this ranking evaluation is done by sampling top n_rec recommendations
# rather than all ranks for all items (very slow and memory-wise expensive for large data).
# choosing higher values for n_rec makes
# the evaluation more accurate (less pessimistic)
# this way the evaluation is mostly accurate for the top results,
# and is quite pessimistic (especially for AUC, which scores for all ranks) and any non @k metric