Пример #1
0
    def test_data(self):
        """Check the prepared MovieLens CSV files and the derived observation stats.

        Reads the three CSV files whose paths are returned by
        ``get_and_prep_data`` and asserts on their column names and row
        counts, then wraps the ratings in an ``ObservationsDF`` and asserts on
        selected entries of its ``data_info()`` result.
        """
        from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
        csv_paths = get_and_prep_data(movielens_dir)
        rating_csv_path, users_csv_path, movies_csv_path = csv_paths

        # --- ratings file ---
        expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
        ratings_df = pd.read_csv(rating_csv_path)
        self.assertListEqual(list(ratings_df.columns), expected_rating_cols)
        self.assertEqual(len(ratings_df), 1000209)

        # --- users file ---
        expected_user_cols = [
            'user_ind', 'gender', 'age', 'occupation', 'zipcode', 'index',
            'occupation_name', 'userid',
        ]
        users_df = pd.read_csv(users_csv_path)
        self.assertListEqual(list(users_df.columns), expected_user_cols)
        self.assertEqual(len(users_df), 6040)

        # --- movies file ---
        expected_movie_cols = ['item_ind', 'itemid', 'genres']
        movies_df = pd.read_csv(movies_csv_path)
        self.assertListEqual(list(movies_df.columns), expected_movie_cols)
        self.assertEqual(len(movies_df), 3883)

        # --- summary reported by the observations handler ---
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
        info = ObservationsDF(df_obs=ratings_df).data_info()
        expected_info = (
            ('len', 989539),
            ('n_unique_items', 3706),
            ('n_unique_users', 5796),
            ('ratings_20_pctl', 3.0),
            ('ratings_80_pctl', 5.0),
        )
        # checked in the listed order, so the first mismatch is the one reported
        for key, expected in expected_info:
            self.assertEqual(info[key], expected)
    def test_data(self):
        """Check the prepared MovieLens CSV files and the derived observation stats.

        Reads the three CSV files whose paths are returned by
        ``get_and_prep_data`` and asserts on their column names and row
        counts (the movies columns are compared as a set, so their order is
        not checked), then wraps the ratings in an ``ObservationsDF`` and
        asserts on selected entries of its ``data_info()`` result.
        """
        from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
        csv_paths = get_and_prep_data(movielens_dir)
        rating_csv_path, users_csv_path, movies_csv_path = csv_paths

        # --- ratings file ---
        expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
        ratings_df = pd.read_csv(rating_csv_path)
        self.assertListEqual(list(ratings_df.columns), expected_rating_cols)
        self.assertEqual(len(ratings_df), 1000209)

        # --- users file ---
        expected_user_cols = [
            'gender', 'age', 'occupation', 'zipcode', 'index',
            'occupation_name', 'userid',
        ]
        users_df = pd.read_csv(users_csv_path)
        self.assertListEqual(list(users_df.columns), expected_user_cols)
        self.assertEqual(len(users_df), 6040)

        # --- movies file: the item id plus one column per genre name ---
        expected_movie_cols = {
            'itemid',
            'Adventure', 'FilmNoir', 'Comedy', 'SciFi', 'Fantasy', 'Crime',
            'Mystery', 'Action', 'Thriller', 'Horror', 'Musical', 'Drama',
            'Western', 'War', 'Animation', 'Romance', 'Childrens',
            'Documentary',
        }
        movies_df = pd.read_csv(movies_csv_path)
        self.assertSetEqual(set(movies_df.columns), expected_movie_cols)
        self.assertEqual(len(movies_df), 3883)

        # --- summary reported by the observations handler ---
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
        info = ObservationsDF(df_obs=ratings_df).data_info()
        expected_info = (
            ('len', 989539),
            ('n_unique_items', 3706),
            ('n_unique_users', 5796),
            ('ratings_20_pctl', 3.0),
            ('ratings_80_pctl', 5.0),
        )
        # checked in the listed order, so the first mismatch is the one reported
        for key, expected in expected_info:
            self.assertEqual(info[key], expected)
"""
This is an example on the MovieLens-1M dataset demonstrating:
    - More advanced fitting features: fit, evaluation, early stopping, hyper-param search
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# get the paths of the three CSV files (no directory argument is passed here)
# and load only the ratings table; the users / movies paths are not used below
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# wrap the ratings in an observations handler; 'userid' and 'itemid' are
# passed as the uid / iid column names
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
# split into train and test observations
# NOTE(review): ratio=0.2 is presumably the test fraction -- confirm in split_train_test
train_obs, test_obs = obs.split_train_test(ratio=0.2)
lfm_rec = LightFMRecommender()

# train LightFM with early stopping and print evaluation results
# (only train_obs is passed to the fit; test_obs is used for the evaluation below)
lfm_rec.fit_with_early_stop(train_obs,
                            epochs_max=30,
                            epochs_step=1,
                            stop_patience=1,
                            valid_ratio=0.2,
                            metric='n-MRR@10',
                            refit_on_all=True)
print(
    lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop '))

# perform a hyperparameter search on LightFM recommender
space = lfm_rec.guess_search_space()
hp_results = lfm_rec.hyper_param_search(
Пример #4
0
import pandas as pd
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
from ml_recsys_tools.utils.testing import TestCaseWithState
from tests.test_movielens_data import movielens_dir

from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.data_handlers.interactions_with_features import ObsWithFeatures

# Module-level call: the three CSV paths are resolved once, when this module is
# imported, and are then available to the code below as module globals.
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data(
    movielens_dir)


class TestRecommendersBasic(TestCaseWithState):
    def _obs_split_data_check(self, obs_full, obs1, obs2):
        # all the data is still there
        self.assertEqual(
            len(obs1.df_obs) + len(obs2.df_obs), len(obs_full.df_obs))
        # no intersections
        intersections = pd.merge(obs1.df_obs,
                                 obs2.df_obs,
                                 on=['userid', 'itemid'],
                                 how='inner')
        self.assertEqual(len(intersections), 0)

    def _split_tester(self, obs):
        ratio = 0.2

        # regular split
        train_obs, test_obs = obs.split_train_test(ratio=ratio)
        self._obs_split_data_check(obs, train_obs, test_obs)
        self.state.train_obs, self.state.test_obs = train_obs, test_obs