def _setup_obs_handler(self):
    """Populate ``self.state.train_obs`` and ``self.state.test_obs``.

    Reads the ratings CSV at ``rating_csv_path``, wraps it in an
    ``ObservationsDF`` using the configured user / item id columns, calls
    ``sample_observations(n_users=1000, n_items=1000)`` and stores the
    result of ``split_train_test(ratio=0.2, users_ratio=1.0)`` on
    ``self.state``.  The train observations frame is then replaced by the
    output of ``self._add_testing_obs_data``.
    """
    raw_ratings = pd.read_csv(rating_csv_path)
    full_obs = ObservationsDF(
        raw_ratings,
        uid_col=self.user_id_col,
        iid_col=self.item_id_col,
    )
    sampled_obs = full_obs.sample_observations(n_users=1000, n_items=1000)

    split_pair = sampled_obs.split_train_test(ratio=0.2, users_ratio=1.0)
    self.state.train_obs, self.state.test_obs = split_pair

    # add some fake data for sanity tests
    augmented_df = self._add_testing_obs_data(self.state.train_obs.df_obs)
    self.state.train_obs.df_obs = augmented_df
def test_data(self):
    """Check the prepared CSV files and the observations summary.

    Verifies the column lists and row counts of the ratings, users and
    movies CSVs returned by ``get_and_prep_data(movielens_dir)``, then the
    values reported by ``ObservationsDF.data_info()`` for the ratings.
    """
    from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data

    ratings_path, users_path, movies_path = get_and_prep_data(movielens_dir)

    # ratings file
    ratings = pd.read_csv(ratings_path)
    expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
    self.assertListEqual(list(ratings.columns), expected_rating_cols)
    self.assertEqual(len(ratings), 1000209)

    # users file
    users = pd.read_csv(users_path)
    expected_user_cols = [
        'user_ind',
        'gender',
        'age',
        'occupation',
        'zipcode',
        'index',
        'occupation_name',
        'userid',
    ]
    self.assertListEqual(list(users.columns), expected_user_cols)
    self.assertEqual(len(users), 6040)

    # movies file
    movies = pd.read_csv(movies_path)
    expected_movie_cols = ['item_ind', 'itemid', 'genres']
    self.assertListEqual(list(movies.columns), expected_movie_cols)
    self.assertEqual(len(movies), 3883)

    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    # summary of the observations built from the ratings
    summary = ObservationsDF(df_obs=ratings).data_info()
    expected_summary = {
        'len': 989539,
        'n_unique_items': 3706,
        'n_unique_users': 5796,
        'ratings_20_pctl': 3.0,
        'ratings_80_pctl': 5.0,
    }
    # dict order is insertion order, so the checks run in the listed order
    for key, expected_value in expected_summary.items():
        self.assertEqual(summary[key], expected_value)
def test_data(self):
    """Check the prepared CSV files and the observations summary.

    Verifies the columns and row counts of the ratings, users and movies
    CSVs returned by ``get_and_prep_data(movielens_dir)`` (movie columns are
    compared as a set), then the values reported by
    ``ObservationsDF.data_info()`` for the ratings.
    """
    from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data

    ratings_path, users_path, movies_path = get_and_prep_data(movielens_dir)

    # ratings file
    ratings = pd.read_csv(ratings_path)
    expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
    self.assertListEqual(list(ratings.columns), expected_rating_cols)
    self.assertEqual(len(ratings), 1000209)

    # users file
    users = pd.read_csv(users_path)
    expected_user_cols = [
        'gender',
        'age',
        'occupation',
        'zipcode',
        'index',
        'occupation_name',
        'userid',
    ]
    self.assertListEqual(list(users.columns), expected_user_cols)
    self.assertEqual(len(users), 6040)

    # movies file: column order is not checked, only membership
    movies = pd.read_csv(movies_path)
    expected_movie_cols = {
        'itemid',
        'Adventure',
        'FilmNoir',
        'Comedy',
        'SciFi',
        'Fantasy',
        'Crime',
        'Mystery',
        'Action',
        'Thriller',
        'Horror',
        'Musical',
        'Drama',
        'Western',
        'War',
        'Animation',
        'Romance',
        'Childrens',
        'Documentary',
    }
    self.assertSetEqual(set(movies.columns), expected_movie_cols)
    self.assertEqual(len(movies), 3883)

    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    # summary of the observations built from the ratings
    summary = ObservationsDF(df_obs=ratings).data_info()
    expected_summary = {
        'len': 989539,
        'n_unique_items': 3706,
        'n_unique_users': 5796,
        'ratings_20_pctl': 3.0,
        'ratings_80_pctl': 5.0,
    }
    # dict order is insertion order, so the checks run in the listed order
    for key, expected_value in expected_summary.items():
        self.assertEqual(summary[key], expected_value)
def fit(self, train_obs: ObservationsDF, *args, **kwargs):
    """Fit the factorizer and the regressor on two parts of ``train_obs``.

    ``train_obs`` is split with ``ratio=self.stacking_split``.  The first
    part of the split is handed to ``_set_data`` and ``_fit_factorizer``,
    the second part to ``_fit_regressor``.  Item features are set from the
    full, unsplit ``train_obs``.

    Extra positional and keyword arguments are accepted but not used here.
    """
    split_pair = train_obs.split_train_test(ratio=self.stacking_split)
    factorizer_part, regressor_part = split_pair

    # item features come from the complete observations, not from a split
    self._set_item_features_df(train_obs)

    self._set_data(factorizer_part)
    self._fit_factorizer(factorizer_part)
    self._fit_regressor(regressor_part)
def test_splits(self):
    """Run ``_split_tester`` on plain and on feature-carrying observations.

    Both handlers are built from the ratings CSV with the same column
    settings and go through ``sample_observations(n_users=1000,
    n_items=1000)`` before being checked; the second one additionally gets
    the movies CSV as its items frame.
    """
    ratings = pd.read_csv(rating_csv_path)
    column_spec = {
        'uid_col': 'userid',
        'iid_col': 'itemid',
        'timestamp_col': 'timestamp',
    }

    # observations without item features
    plain_obs = ObservationsDF(ratings, **column_spec).sample_observations(
        n_users=1000, n_items=1000)
    self._split_tester(plain_obs)

    # observations with an items frame attached
    items = pd.read_csv(movies_csv_path)
    featured_obs = ObsWithFeatures(
        df_obs=ratings,
        df_items=items,
        item_id_col='itemid',
        **column_spec,
    ).sample_observations(n_users=1000, n_items=1000)
    self._split_tester(featured_obs)
def test_splits(self):
    """Exercise ``split_train_test`` in three configurations.

    The ratings are loaded, wrapped in an ``ObservationsDF`` and passed
    through ``sample_observations(n_users=1000, n_items=1000)``.  Each
    split is checked with ``_obs_split_data_check``; the result of the
    first (regular) split is also stored on ``self.state``.
    """
    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    ratings = pd.read_csv(rating_csv_path)
    obs = ObservationsDF(
        ratings,
        uid_col='userid',
        iid_col='itemid',
        timestamp_col='timestamp',
    )
    obs = obs.sample_observations(n_users=1000, n_items=1000)

    ratio = 0.2

    # regular split
    plain_train, plain_test = obs.split_train_test(ratio=ratio)
    self._obs_split_data_check(obs, plain_train, plain_test)
    self.state.train_obs = plain_train
    self.state.test_obs = plain_test

    # split for only some users
    user_ratio = 0.2
    users_train, users_test = obs.split_train_test(
        ratio=ratio, users_ratio=user_ratio)
    self._obs_split_data_check(obs, users_train, users_test)
    n_test_users = users_test.df_obs['userid'].nunique()
    n_train_users = users_train.df_obs['userid'].nunique()
    post_split_ratio = n_test_users / n_train_users
    self.assertAlmostEqual(user_ratio, post_split_ratio, places=1)

    # split by timestamp
    time_col = obs.timestamp_col
    time_train, time_test = obs.split_train_test(
        ratio=ratio, time_split_column=time_col)
    self._obs_split_data_check(obs, time_train, time_test)
    # no test timestamp may be smaller than the largest train timestamp
    earliest_test = time_test.df_obs[time_col].min()
    latest_train = time_train.df_obs[time_col].max()
    self.assertGreaterEqual(earliest_test, latest_train)
""" This is an example on datasets-1M demonstrating: - More advanced fitting features: fit, evaluation, early stopping, hyper-param search """ from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data import pandas as pd from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data() ratings_df = pd.read_csv(rating_csv_path) obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid') train_obs, test_obs = obs.split_train_test(ratio=0.2) lfm_rec = LightFMRecommender() # train LightFM with early stopping and print evaluation results lfm_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=1, stop_patience=1, valid_ratio=0.2, metric='n-MRR@10', refit_on_all=True) print( lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop ')) # perform a hyperparameter search on LightFM recommender space = lfm_rec.guess_search_space() hp_results = lfm_rec.hyper_param_search(
"""
This is an example on datasets-1M demonstrating recommenders from spotlight library
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# dataset: prepare the CSV files and load the ratings
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# wrap the ratings in a data handler and split using the timestamp column
obs = ObservationsDF(
    ratings_df,
    uid_col='userid',
    iid_col='itemid',
    timestamp_col='timestamp',
)
train_obs, test_obs = obs.split_train_test(
    ratio=0.2,
    time_split_column=obs.timestamp_col,
)

from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender

emb_rec = EmbeddingFactorsRecommender(
    model_params={'loss': 'adaptive_hinge', 'n_iter': 1})

# alternative without early stopping: emb_rec.fit(train_obs)
emb_rec.fit_with_early_stop(train_obs, epochs_max=5, epochs_step=1)
print(emb_rec.eval_on_test_by_ranking(test_obs, prefix='implicit embeddings '))

# trying to reproduce this:
# https://github.com/maciejkula/spotlight/tree/master/examples/movielens_sequence
from ml_recsys_tools.recommenders.spotlight_recommenders import SequenceEmbeddingRecommender

seq_rec = SequenceEmbeddingRecommender(
    model_params={
        'n_iter': 15,
        'embedding_dim': 32,
        'batch_size': 32,
        'learning_rate': 0.01,
    },
    fit_params={
        'max_sequence_length': 200,
        'timestamp_col': 'timestamp',
    },
)
seq_rec.fit(train_obs)
# alternative kept from the original:
# emb_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=3)
print(seq_rec.eval_on_test_by_ranking(test_obs, prefix='lstm ', include_train=False))
""" using multiple test sets """ # dataset: download and prepare dataframes import pandas as pd from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data() # read the interactions dataframe and create a data handler object and split to train and test ratings_df = pd.read_csv(rating_csv_path) from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF obs = ObservationsDF(ratings_df) train_obs, test_obs = obs.split_train_test(ratio=0.2, users_ratio=0.2) def construct_multiple_test_sets(test_df, train_df): # by user history - active and inactive users user_hist_counts = train_df.userid.value_counts() user_hist_counts.hist(bins=100, alpha=0.5) active_users = user_hist_counts[user_hist_counts >= 300].index.tolist() test_df_act_us = test_df[test_df.userid.isin(active_users)] test_df_nonact_us = test_df[~test_df.userid.isin(active_users)] # by item popularity- popular and unpopular items item_hist_counts = train_df.itemid.value_counts() item_hist_counts.hist(bins=100, alpha=0.5) popular_items = item_hist_counts[item_hist_counts >= 1000].index.tolist()