def get_ICM_all(reader: RecSys2019Reader):
    """Build the full ICM_all matrix with feature engineering applied.

    Pipeline: drop the precomputed "ICM_all", engineer count/list features
    from the URM and UCM, filter upper-tail outliers on asset/price,
    transform and discretize the numeric features, then concatenate.

    :param reader: data splitter
    :return: the rebuilt ICM_all matrix
    """
    urm = reader.get_URM_all()
    ucm_dict = reader.get_loaded_UCM_dict()
    icm_dict = reader.get_loaded_ICM_dict()

    # The precomputed all-in-one ICM is rebuilt from scratch below.
    del icm_dict["ICM_all"]

    icm_dict = apply_feature_engineering_ICM(
        icm_dict, urm, ucm_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])

    # Keep only values below a Tukey-style upper fence (Q3 + k * IQR).
    def _below_upper_fence(multiplier):
        def _filter(x):
            q3 = np.quantile(x, q=0.75)
            iqr = q3 - np.quantile(x, q=0.25)
            return x < q3 + multiplier * iqr
        return _filter

    icm_dict = apply_filtering_ICM(
        icm_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })

    # log1p of the reciprocal compresses large asset/price values.
    def _log_reciprocal(x):
        return np.log1p(1 / x)

    icm_dict = apply_transformation_ICM(
        icm_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log_reciprocal,
            "ICM_price": _log_reciprocal,
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5),
        })
    icm_dict = apply_discretization_ICM(
        icm_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50,
        })

    return build_ICM_all_from_dict(icm_dict)
def main():
    """Load and split the dataset, build side information, and evaluate the model."""
    # Data loading and leave-k-out splitting
    data_path = os.path.join(get_project_root_path(), "data/")
    splitter = New_DataSplitter_leave_k_out(
        RecSys2019Reader(data_path),
        k_out_value=K_OUT,
        use_validation_set=False,
        allow_cold_users=ALLOW_COLD_USERS,
        force_new_split=True,
        seed=get_split_seed())
    splitter.load_data()
    URM_train, URM_test = splitter.get_holdout_split()

    # Side information (item and user content matrices)
    ICM_all, _ = get_ICM_train_new(splitter)
    UCM_all, _ = get_UCM_train_new(splitter)

    # Users excluded from the evaluation (profile-length thresholds, non-target)
    users_to_ignore = get_ignore_users(
        URM_train,
        splitter.get_original_user_id_to_index_mapper(),
        lower_threshold=LOWER_THRESHOLD,
        upper_threshold=UPPER_THRESHOLD,
        ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[CUTOFF],
                                 ignore_users=users_to_ignore)

    # Fit the model and print its holdout metrics
    recommender = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(recommender))
def get_UCM_all(reader: RecSys2019Reader):
    """Build the full UCM_all matrix with feature engineering applied.

    :param reader: data splitter
    :return: the concatenated UCM_all matrix
    """
    urm = reader.get_URM_all()
    ucm_dict = reader.get_loaded_UCM_dict()
    icm_dict = reader.get_loaded_ICM_dict()

    ucm_dict = apply_feature_engineering_UCM(
        ucm_dict, urm, icm_dict, ICM_names_to_UCM=["ICM_sub_class"])

    # These are useful feature weighting for UserCBF_CF_Warm
    ucm_dict = apply_transformation_UCM(
        ucm_dict,
        UCM_name_to_transform_mapper={
            "UCM_sub_class": lambda x: x / 2,
            "UCM_user_act": np.log1p,
        })
    ucm_dict = apply_discretization_UCM(
        ucm_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})

    return build_UCM_all_from_dict(ucm_dict)
def _remove_len_1_interactions(URM):
    """Zero out every interaction of users that have exactly one interaction.

    :param URM: user-rating matrix (any scipy.sparse format)
    :return: tuple (cleaned CSR matrix, array of user indices that were zeroed)
    """
    len_1_users_mask = np.ediff1d(URM.tocsr().indptr) == 1
    len_1_users = np.arange(URM.shape[0])[len_1_users_mask]
    # LIL supports efficient row assignment; convert back to CSR afterwards.
    URM = URM.tolil()
    URM[len_1_users, :] = 0
    return URM.tocsr(), len_1_users


def main():
    """Tune a user-demographic recommender's hyperparameters on a leave-k-out split."""
    args = get_arguments()

    # Data loading
    root_data_path = args.reader_path
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=K_OUT,
                                               allow_cold_users=ALLOW_COLD_USERS,
                                               use_validation_set=False,
                                               force_new_split=True, seed=args.seed)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Remove interactions of users that have exactly one interaction, on both splits.
    URM_train, _ = _remove_len_1_interactions(URM_train)
    URM_test, test_len_1_users = _remove_len_1_interactions(URM_test)

    UCM_all = get_UCM_train_cold(data_reader)

    ignore_users = get_ignore_users(URM_train,
                                    data_reader.get_original_user_id_to_index_mapper(),
                                    lower_threshold=args.lower_threshold,
                                    upper_threshold=args.upper_threshold,
                                    ignore_non_target_users=args.exclude_non_target)
    # NOTE(review): only the users zeroed on the TEST side are excluded from
    # evaluation here (matching the original code); train-side len-1 users are
    # not added — confirm this is intended.
    ignore_users = np.concatenate([ignore_users, test_len_1_users])

    # Setting evaluator
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list,
                                 ignore_users=ignore_users)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/{}/".format(args.recommender_name)
    now = datetime.now().strftime('%b%d_%H-%M-%S')
    now = now + "_k_out_value_{}/".format(K_OUT)
    version_path = version_path + "/" + now

    run_parameter_search_user_demographic(URM_train=URM_train,
                                          UCM_object=UCM_all,
                                          UCM_name="UCM_all",
                                          recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                          evaluator_validation=evaluator,
                                          metric_to_optimize="MAP",
                                          output_folder_path=version_path,
                                          parallelizeKNN=True,
                                          n_cases=int(args.n_cases),
                                          n_random_starts=int(args.n_random_starts))
    print("...tuning ended")
def read_split_load_data(k_out, allow_cold_users, seed):
    """Read the RecSys2019 dataset and return a loaded leave-k-out splitter.

    :param k_out: number of interactions held out per user
    :param allow_cold_users: whether users with no train interactions are kept
    :param seed: random seed used to generate the split
    :return: a loaded New_DataSplitter_leave_k_out instance
    """
    data_path = os.path.join(get_project_root_path(), "data/")
    splitter = New_DataSplitter_leave_k_out(
        RecSys2019Reader(data_path),
        k_out_value=k_out,
        use_validation_set=False,
        allow_cold_users=allow_cold_users,
        force_new_split=True,
        seed=seed)
    splitter.load_data()
    return splitter
def get_ICM_all_new(reader: RecSys2019Reader):
    """Build the full ICM_all matrix with feature engineering applied.

    Pipeline: drop the precomputed "ICM_all", filter upper-tail outliers on
    asset/price, transform and discretize the numeric features, apply
    per-ICM feature weights, then concatenate.

    :param reader: data splitter
    :return: the rebuilt ICM_all matrix
    """
    icm_dict = reader.get_loaded_ICM_dict()
    # The precomputed all-in-one ICM is rebuilt from the processed ICMs below.
    del icm_dict["ICM_all"]

    # Keep only values below a Tukey-style upper fence (Q3 + k * IQR).
    def _below_upper_fence(multiplier):
        def _filter(x):
            q3 = np.quantile(x, q=0.75)
            iqr = q3 - np.quantile(x, q=0.25)
            return x < q3 + multiplier * iqr
        return _filter

    icm_dict = apply_filtering_ICM(
        icm_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })

    # Apply useful transformation
    def _log_reciprocal(x):
        return np.log1p(1 / x)

    icm_dict = apply_transformation_ICM(
        icm_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log_reciprocal,
            "ICM_price": _log_reciprocal,
            "ICM_item_pop": np.log1p,
        })
    icm_dict = apply_discretization_ICM(
        icm_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
        })

    # Apply feature weighting (tuned multiplicative weights per ICM)
    feature_weights = {
        "ICM_price": 1.8474248499810804,
        "ICM_asset": 1.2232716972721878,
        "ICM_sub_class": 1.662671860026709,
        "ICM_item_pop": 0.886528360392298,
    }
    icm_dict = apply_transformation_ICM(
        icm_dict,
        ICM_name_to_transform_mapper={
            # Inner lambda binds w as a default to avoid late-binding issues.
            name: (lambda x, w=weight: x * w)
            for name, weight in feature_weights.items()
        })

    return build_ICM_all_from_dict(icm_dict)
def _prepare_urm_train(URM_train, data_reader, recommender_name):
    """Apply the recommender-specific URM augmentation (ICM stacking / TF-IDF).

    :param URM_train: train user-rating matrix
    :param data_reader: loaded data splitter (used to build the ICM when needed)
    :param recommender_name: CLI recommender identifier
    :return: the (possibly augmented) train matrix
    """
    if recommender_name == "sslim_bpr":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
    if recommender_name == "rp3beta_side":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
        URM_train = TF_IDF(URM_train).tocsr()
    if recommender_name == "pure_svd":
        URM_train = TF_IDF(URM_train).tocsr()
    if recommender_name == "pure_svd_side":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
    return URM_train


def _select_ignore_users(URM_train, exclude_cold_users, h, fol):
    """Return the array of user indices to exclude from evaluation, or None.

    :param URM_train: train user-rating matrix
    :param exclude_cold_users: if truthy, additionally exclude users with no
        train interactions (only relevant in the ``fol`` and default branches;
        when ``h`` is set, cold users have 0 < h interactions and are already
        excluded by that branch)
    :param h: if non-zero, exclude users with fewer than ``h`` interactions
    :param fol: if non-zero (and ``h`` is 0), exclude users with more than
        ``fol`` interactions
    :return: numpy array of user indices, or None if nothing is excluded
    """
    profile_lengths = np.ediff1d(URM_train.tocsr().indptr)
    user_indices = np.arange(URM_train.shape[0])
    if h != 0:
        print("Excluding users with less than {} interactions".format(h))
        return user_indices[profile_lengths < h]
    if fol != 0:
        print("Excluding users with more than {} interactions".format(fol))
        ignore_users = user_indices[profile_lengths > fol]
        if exclude_cold_users:
            cold_users = user_indices[profile_lengths == 0]
            ignore_users = np.unique(np.concatenate((cold_users, ignore_users)))
        return ignore_users
    if exclude_cold_users:
        print("Excluding cold users...")
        return user_indices[profile_lengths == 0]
    return None


def main():
    """Tune a collaborative recommender's hyperparameters on a leave-3-out split."""
    args = get_arguments()

    # Data loading
    data_reader = RecSys2019Reader(args.reader_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True, seed=args.seed)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    URM_train = _prepare_urm_train(URM_train, data_reader, args.recommender_name)

    # Setting evaluator
    ignore_users = _select_ignore_users(URM_train,
                                        args.exclude_users,
                                        int(args.focus_on_high),
                                        int(args.focus_on_low))
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list,
                                 ignore_users=ignore_users)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/{}/".format(args.recommender_name)
    now = datetime.now().strftime('%b%d_%H-%M-%S')
    now = now + "_k_out_value_3/"
    version_path = version_path + "/" + now

    runParameterSearch_Collaborative(URM_train=URM_train,
                                     recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                     evaluator_validation=evaluator,
                                     metric_to_optimize="MAP",
                                     output_folder_path=version_path,
                                     n_cases=int(args.n_cases),
                                     n_random_starts=int(args.n_random_starts))
    print("...tuning ended")
from datetime import datetime

from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    # Mapping from original user ids to URM row indices
    mapper = data_reader.get_original_user_id_to_index_mapper()
    # Preprocessed interaction dataframe restricted to warm users
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    # NOTE(review): the call to run_parameter_search_advanced_top_pop appears to be
    # outside this chunk — the script as shown ends right after this print.
    print("Start tuning...")
from course_lib.Base.NonPersonalizedRecommender import TopPop
from scripts.fm_model.write_ffm_data_uncompressed import get_ICM_with_fields, get_UCM_with_fields
from scripts.scripts_utils import set_env_variables
from src.data_management.DataPreprocessing import DataPreprocessingRemoveColdUsersItems
from src.data_management.New_DataSplitter_leave_k_out import *
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.model import new_best_models
from src.model.FactorizationMachine.FieldAwareFMRecommender import FieldAwareFMRecommender
from src.utils.general_utility_functions import get_split_seed, get_project_root_path

if __name__ == '__main__':
    set_env_variables()

    # Data loading
    root_data_path = "../data/"
    data_reader = RecSys2019Reader(root_data_path)
    # Presumably filters out users/items below these interaction thresholds
    # before splitting — confirm against DataPreprocessingRemoveColdUsersItems.
    data_reader = DataPreprocessingRemoveColdUsersItems(data_reader, threshold_users=25, threshold_items=20)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Build ICMs
    ICM_all, item_feature_fields = get_ICM_with_fields(data_reader)

    # Build UCMs: do not change the order of ICMs and UCMs
from src.data_management.RecSys2019Reader import RecSys2019Reader
from course_lib.Data_manager.DataSplitter_k_fold import DataSplitter_Warm_k_fold
from course_lib.Base.Evaluation.Evaluator import *
from course_lib.ParameterTuning.run_parameter_search import *
from course_lib.Notebooks_utils.data_splitter import train_test_holdout

if __name__ == '__main__':
    # Data loading
    dataset = RecSys2019Reader("../data/train.csv", "../data/tracks.csv")
    dataset = DataSplitter_Warm_k_fold(dataset, n_folds=10)
    dataset.load_data()
    URM_train, URM_test = dataset.get_URM_train_for_test_fold(n_test_fold=8)

    # Hyperparameter tuning
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)
    # Re-split URM_test: the 5% "test" part becomes a smaller matrix used only
    # for the early-stopping evaluator (fake_URM_train is unused).
    fake_URM_train, subset_URM_test = train_test_holdout(URM_test, train_perc=0.95)
    evaluator_early_stopping = EvaluatorHoldout(subset_URM_test, cutoff_list=cutoff_list)

    # It would be better to use a subset of the matrix for early stopping,
    # so that it runs much faster...
    print("Start tuning...")
    runParameterSearch_Collaborative(
        URM_train=URM_train,
        recommender_class=SLIM_BPR_Cython,
        evaluator_validation=evaluator,
        evaluator_validation_earlystopping=evaluator_early_stopping,
        metric_to_optimize="MAP",
from src.data_management.New_DataSplitter_leave_k_out import *
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.RecSys2019Reader_utils import get_ICM_numerical, merge_UCM
from src.data_management.data_getter import get_warmer_UCM
from src.model.FallbackRecommender.AdvancedTopPopular import AdvancedTopPopular
from src.plots.recommender_plots import *
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.model import best_models
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data reading
    data_reader = RecSys2019Reader()
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    # Mapping from original user ids to URM row indices, read straight from the
    # splitter's global mapper dict
    mapper = data_reader.SPLIT_GLOBAL_MAPPER_DICT['user_original_ID_to_index']
    # Preprocessed interaction dataframe restricted to warm users
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Build ICMs
    ICM_numerical, _ = get_ICM_numerical(data_reader.dataReader_object)
    ICM_categorical = data_reader.get_ICM_from_name("ICM_sub_class")

    # Build UCMs
    URM_all = data_reader.dataReader_object.get_URM_all()
    UCM_age = data_reader.dataReader_object.get_UCM_from_name("UCM_age")
from course_lib.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from src.data_management.RecSys2019Reader import RecSys2019Reader
from course_lib.Data_manager.DataSplitter_k_fold import DataSplitter_Warm_k_fold

if __name__ == '__main__':
    # Read the dataset and split it into 10 warm folds; train on fold 9's split.
    splitter = DataSplitter_Warm_k_fold(RecSys2019Reader("../../data/"), n_folds=10)
    splitter.load_data()
    URM_train, URM_test = splitter.get_URM_train_for_test_fold(n_test_fold=9)

    # Fit a user-based collaborative KNN and print a sample recommendation.
    recommender = UserKNNCFRecommender(URM_train)
    recommender.fit()
    top_items = recommender.recommend(1, cutoff=10)
    print("The recommendation for user 1 is: {}".format(top_items))
# + {"pycharm": {"name": "#%%\n", "is_executing": false}} df_target.head() # + {"pycharm": {"name": "#%%\n", "is_executing": false}} target_users = df_target.user_id.values target_users # + {"pycharm": {"name": "#%%\n", "is_executing": false}} print("There are %d users in the target users" % len(target_users)) # - # ## Analyze target users w.r.t. URM # + {"pycharm": {"name": "#%%\n", "is_executing": false}} dataset = RecSys2019Reader() dataset.load_data() # + {"pycharm": {"name": "#%%\n", "is_executing": false}} URM_all = dataset.get_URM_all() URM_all # + {"pycharm": {"name": "#%%\n", "is_executing": false}} URM_user_mapper = dataset.get_user_original_ID_to_index_mapper() original_users_URM = list(URM_user_mapper.keys()) # + {"pycharm": {"name": "#%%\n", "is_executing": false}} mask = np.in1d(target_users, original_users_URM, assume_unique=True) missing_users = target_users[~mask] missing_users
def get_UCM_all_new(reader: RecSys2019Reader):
    """Build the full UCM_all matrix with feature engineering applied.

    First preprocesses the ICMs (they are a source of user features), then
    engineers, transforms, and discretizes the user features before
    concatenating everything.

    :param reader: data splitter
    :return: the concatenated UCM_all matrix
    """
    urm = reader.get_URM_all()
    ucm_dict = reader.get_loaded_UCM_dict()
    icm_dict = reader.get_loaded_ICM_dict()

    # --- Preprocess ICM ---
    del icm_dict["ICM_all"]
    icm_dict = apply_feature_engineering_ICM(
        icm_dict, urm, ucm_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])

    # Keep only values below a Tukey-style upper fence (Q3 + k * IQR).
    def _below_upper_fence(multiplier):
        def _filter(x):
            q3 = np.quantile(x, q=0.75)
            iqr = q3 - np.quantile(x, q=0.25)
            return x < q3 + multiplier * iqr
        return _filter

    icm_dict = apply_filtering_ICM(
        icm_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })

    # log1p of the reciprocal compresses large asset/price values.
    def _log_reciprocal(x):
        return np.log1p(1 / x)

    icm_dict = apply_transformation_ICM(
        icm_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": _log_reciprocal,
            "ICM_price": _log_reciprocal,
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5),
        })
    icm_dict = apply_discretization_ICM(
        icm_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50,
        })

    # --- Preprocess UCM ---
    ucm_dict = apply_feature_engineering_UCM(
        ucm_dict, urm, icm_dict,
        ICM_names_to_UCM=["ICM_sub_class", "ICM_item_pop"])
    ucm_dict = apply_feature_entropy_UCM(
        ucm_dict, UCM_names_to_entropy=["UCM_sub_class"])

    # Apply useful transformation
    ucm_dict = apply_transformation_UCM(
        ucm_dict, UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})
    ucm_dict = apply_discretization_UCM(
        ucm_dict,
        UCM_name_to_bins_mapper={
            "UCM_user_act": 50,
            "UCM_sub_class_entropy": 20,
        })

    return build_UCM_all_from_dict(ucm_dict)