def get_ICM_with_fields(reader: New_DataSplitter_leave_k_out):
    """Build the feature-engineered train ICM together with a per-column field index.

    :param reader: data splitter holding the train split and the loaded ICMs/UCMs
    :return: tuple (ICM_all, item_feature_fields) where item_feature_fields[j] is
             the index of the source ICM that produced column j of ICM_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")

    ICM_all_dict = apply_feature_engineering_ICM(ICM_all_dict, URM_train, UCM_all_dict,
                                                 ICM_names_to_count=[], UCM_names_to_list=[])
    # Keep only values below Q3 + k * IQR (k is feature-specific)
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": lambda x: x < np.quantile(x, q=0.75) + 0.72 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price": lambda x: x < np.quantile(x, q=0.75) + 4 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50
        })

    # Stack every ICM side by side, remembering which ICM each column came from
    ICM_all = None
    item_feature_fields = None
    for field_id, ICM_matrix in enumerate(ICM_all_dict.values()):
        fields_chunk = np.full(shape=ICM_matrix.shape[1], fill_value=field_id)
        if ICM_all is None:
            ICM_all = ICM_matrix
            item_feature_fields = fields_chunk
        else:
            ICM_all = sps.hstack([ICM_all, ICM_matrix], format="csr")
            item_feature_fields = np.concatenate([item_feature_fields, fields_chunk])
    return ICM_all, item_feature_fields
def read_split_load_data(k_out, allow_cold_users, seed):
    """Create a RecSys2019 reader wrapped in a leave-k-out splitter and load its data.

    :param k_out: number of interactions held out per user
    :param allow_cold_users: whether users without train interactions are kept
    :param seed: seed used by the splitter
    :return: the loaded New_DataSplitter_leave_k_out
    """
    root_data_path = os.path.join(get_project_root_path(), "data/")
    splitter = New_DataSplitter_leave_k_out(RecSys2019Reader(root_data_path),
                                            k_out_value=k_out,
                                            use_validation_set=False,
                                            allow_cold_users=allow_cold_users,
                                            force_new_split=True,
                                            seed=seed)
    splitter.load_data()
    return splitter
def get_ICM_train(reader: New_DataSplitter_leave_k_out):
    """Return the feature-engineered train ICM used by the new_best_models file.

    :param reader: data splitter holding the train split and the loaded ICMs/UCMs
    :return: ICM_train_all as a single stacked matrix
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")

    ICM_all_dict = apply_feature_engineering_ICM(ICM_all_dict, URM_train, UCM_all_dict,
                                                 ICM_names_to_count=["ICM_sub_class"],
                                                 UCM_names_to_list=["UCM_age"])
    # Keep only values below Q3 + k * IQR (k is feature-specific)
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": lambda x: x < np.quantile(x, q=0.75) + 0.72 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price": lambda x: x < np.quantile(x, q=0.75) + 4 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5)
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50
        })
    return build_ICM_all_from_dict(ICM_all_dict)
def get_UCM_train_cold(reader: New_DataSplitter_leave_k_out):
    """Return the feature-engineered UCM used by the new_best_models file.

    :param reader: data splitter holding the train split and the loaded UCMs
    :return: UCM_all as a single stacked matrix
    """
    # Split is fetched as in the sibling getters; only the UCMs are used here
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    UCM_all_dict = apply_transformation_UCM(UCM_all_dict,
                                            UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})
    UCM_all_dict = apply_discretization_UCM(UCM_all_dict,
                                            UCM_name_to_bins_mapper={"UCM_user_act": 50})
    return build_UCM_all_from_dict(UCM_all_dict)
def get_ICM_train_new(reader: New_DataSplitter_leave_k_out):
    """Build the feature-engineered train ICM plus a name-to-column-range mapper.

    :param reader: data splitter holding the loaded ICMs
    :return: tuple (ICM_all, item_feature_to_range_mapper); the mapper gives, for
             each source ICM name, the (start, end) column range it occupies in ICM_all
    """
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")

    # Keep only values below Q3 + k * IQR (k is feature-specific)
    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": lambda x: x < np.quantile(x, q=0.75) + 0.72 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price": lambda x: x < np.quantile(x, q=0.75) + 4 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    # Apply useful transformation
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50
        })
    # Apply feature weighting (weights presumably come from an earlier tuning run)
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_price": lambda x: x * 1.8474248499810804,
            "ICM_asset": lambda x: x * 1.2232716972721878,
            "ICM_sub_class": lambda x: x * 1.662671860026709,
            "ICM_item_pop": lambda x: x * 0.886528360392298
        })

    # Stack the ICMs, recording the column span each one occupies in the result
    ICM_all = None
    item_feature_to_range_mapper = {}
    offset = 0
    for ICM_name, ICM_matrix in ICM_all_dict.items():
        ICM_all = ICM_matrix if ICM_all is None else sps.hstack([ICM_all, ICM_matrix], format="csr")
        item_feature_to_range_mapper[ICM_name] = (offset, offset + ICM_matrix.shape[1])
        offset += ICM_matrix.shape[1]
    return ICM_all, item_feature_to_range_mapper
def main():
    """Load the seeded leave-k-out split, build the feature-engineered ICM/UCM,
    and print the holdout evaluation of the configured model."""
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = New_DataSplitter_leave_k_out(RecSys2019Reader(root_data_path),
                                               k_out_value=K_OUT,
                                               use_validation_set=False,
                                               allow_cold_users=ALLOW_COLD_USERS,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Users excluded from evaluation (profile-length thresholds / non-target users)
    ignore_users = get_ignore_users(URM_train,
                                    data_reader.get_original_user_id_to_index_mapper(),
                                    lower_threshold=LOWER_THRESHOLD,
                                    upper_threshold=UPPER_THRESHOLD,
                                    ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[CUTOFF], ignore_users=ignore_users)

    # Model evaluation
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
def get_UCM_with_fields(reader: New_DataSplitter_leave_k_out):
    """Build the feature-engineered UCM together with a per-column field index.

    :param reader: data splitter holding the train split and the loaded ICMs/UCMs
    :return: tuple (UCM_all, user_feature_fields) where user_feature_fields[j] is
             the index of the source UCM that produced column j of UCM_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    UCM_all_dict = apply_feature_engineering_UCM(UCM_all_dict, URM_train, ICM_dict,
                                                 ICM_names_to_UCM=["ICM_sub_class"])
    # These are useful feature weighting for UserCBF_CF_Warm
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict,
        UCM_name_to_transform_mapper={
            "UCM_sub_class": lambda x: x / 2,
            "UCM_user_act": np.log1p
        })
    UCM_all_dict = apply_discretization_UCM(UCM_all_dict,
                                            UCM_name_to_bins_mapper={"UCM_user_act": 50})

    # Stack every UCM side by side, remembering which UCM each column came from
    UCM_all = None
    user_feature_fields = None
    for field_id, UCM_matrix in enumerate(UCM_all_dict.values()):
        fields_chunk = np.full(shape=UCM_matrix.shape[1], fill_value=field_id)
        if UCM_all is None:
            UCM_all = UCM_matrix
            user_feature_fields = fields_chunk
        else:
            UCM_all = sps.hstack([UCM_all, UCM_matrix], format="csr")
            user_feature_fields = np.concatenate([user_feature_fields, fields_chunk])
    return UCM_all, user_feature_fields
def get_UCM_train(reader: New_DataSplitter_leave_k_out):
    """Return the feature-engineered UCM used by the new_best_models file.

    :param reader: data splitter holding the train split and the loaded ICMs/UCMs
    :return: UCM_all as a single stacked matrix
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    UCM_all_dict = apply_feature_engineering_UCM(UCM_all_dict, URM_train, ICM_dict,
                                                 ICM_names_to_UCM=["ICM_sub_class"])
    # These are useful feature weighting for UserCBF_CF_Warm
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict,
        UCM_name_to_transform_mapper={
            "UCM_sub_class": lambda x: x / 2,
            "UCM_user_act": np.log1p
        })
    UCM_all_dict = apply_discretization_UCM(UCM_all_dict,
                                            UCM_name_to_bins_mapper={"UCM_user_act": 50})
    return build_UCM_all_from_dict(UCM_all_dict)
from datetime import datetime

from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading: leave-3-out split with a fixed seed for reproducibility
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    mapper = data_reader.get_original_user_id_to_index_mapper()
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    # NOTE(review): the actual call to run_parameter_search_advanced_top_pop is not
    # visible in this chunk — presumably it follows this print; confirm against the full file.
    print("Start tuning...")
# NOTE(review): this chunk starts mid-script — `version_path` and `now` are defined
# earlier in the original file, outside this view.
output_folder_path = version_path + now + "/"
output_file_name = output_folder_path + "results.txt"
try:
    if not os.path.exists(output_folder_path):
        os.mkdir(output_folder_path)
except FileNotFoundError as e:
    # Parent directory missing: create the whole tree, then open the results file
    os.makedirs(output_folder_path)
    f = open(output_file_name, "w")
else:
    # NOTE(review): on the no-exception path f stays None, so the results file is
    # opened ONLY when mkdir raised FileNotFoundError — this looks inverted;
    # confirm whether f should be opened on the success path as well.
    f = None

# Data loading
root_data_path = "../../data/"
data_reader = RecSys2019Reader(root_data_path)
data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                           force_new_split=True, seed=get_split_seed())
data_reader.load_data()
URM_train, URM_test = data_reader.get_holdout_split()
ICM_all = get_ICM_train(data_reader)
UCM_all = get_UCM_train(data_reader)

# Age demographic derived from the raw UCM_age matrix and its feature mapper
UCM_age = data_reader.get_UCM_from_name("UCM_age")
age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)

# Sub-class content, both binned and unbinned variants
ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name(
    "ICM_sub_class")
subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True)
subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False)
UCM_all = sps.hstack([UCM_all, UCM_object], format="csr") user_feature_fields = np.concatenate([ user_feature_fields, np.full(shape=UCM_object.shape[1], fill_value=idx) ]) return UCM_all, user_feature_fields if __name__ == '__main__': data_reader = RecSys2019Reader("../../data/") data_reader = DataPreprocessingRemoveColdUsersItems(data_reader, threshold_items=20, threshold_users=25) data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False, force_new_split=True, seed=get_split_seed()) data_reader.load_data() URM_train, URM_test = data_reader.get_holdout_split() ICM_all, item_feature_fields = get_ICM_with_fields(data_reader) UCM_all, user_feature_fields = get_UCM_with_fields(data_reader) user_fields = np.full(shape=URM_train.shape[0], fill_value=0) item_fields = np.full(shape=URM_train.shape[1], fill_value=1) item_feature_fields = item_feature_fields + 2 user_feature_fields = user_feature_fields + np.max(item_feature_fields) + 1 fields = np.concatenate( [user_fields, item_fields, item_feature_fields, user_feature_fields]) positive_URM = URM_train
from src.data_management.RecSys2019Reader_utils import merge_UCM
from src.data_management.data_getter import get_warmer_UCM
from src.model.HybridRecommender.HybridRankBasedRecommender import HybridRankBasedRecommender
import numpy as np
from src.model.KNN.UserSimilarityRecommender import UserSimilarityRecommender

# Fixed seed so the split below is reproducible
SEED = 69420

if __name__ == '__main__':
    # NOTE(review): `seed`, `RecSys2019Reader` and `New_DataSplitter_leave_k_out`
    # are not imported in this visible chunk — presumably imported earlier in the
    # original file; confirm against the full file.
    seed(SEED)

    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                               force_new_split=True)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    URM_all = data_reader.dataReader_object.get_URM_all()
    UCM_age = data_reader.dataReader_object.get_UCM_from_name("UCM_age")
    UCM_region = data_reader.dataReader_object.get_UCM_from_name("UCM_region")

    # Merge age and region UCMs, keep only warmer users, then append the train URM
    # as extra user features
    UCM_all, _ = merge_UCM(UCM_age, UCM_region, {}, {})
    UCM_all = get_warmer_UCM(UCM_all, URM_all, threshold_users=3)
    UCM_all, _ = merge_UCM(UCM_all, URM_train, {}, {})

    # Users with at least one train interaction
    warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    warm_users = np.arange(URM_train.shape[0])[warm_users_mask]

    # Reset seed for hyper-parameter tuning
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.model_management.model_result_reader import best_model_reader
from course_lib.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from src.data_management.DataPreprocessing import DataPreprocessingRemoveColdUsersItems
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out

# +
# NOTE(review): `# +` / `# -` look like jupytext cell markers; `seed` is not
# imported in this visible chunk — confirm against the full file.
SEED = 69420
seed(SEED)

# Data loading: drop cold users, then leave-3-out split (unseeded splitter;
# reproducibility comes from the seed(SEED) call above)
dataset = RecSys2019Reader("../data/")
dataset = DataPreprocessingRemoveColdUsersItems(dataset, threshold_users=3)
dataset = New_DataSplitter_leave_k_out(dataset, k_out_value=3, use_validation_set=False,
                                       force_new_split=True)
dataset.load_data()
seed()  # reset random seeds for other things
# -

URM_train, URM_test = dataset.get_holdout_split()
ICM_all = dataset.get_ICM_from_name('ICM_all')

# Load previously tuned hyper-parameters from the report folder
best_model_list = best_model_reader("../report/hp_tuning/item_cbf/Nov19_11-23-21_k_out_value_3/")
best_model_list

# Refit the best cosine ItemKNN-CBF configuration
cosine_best_model = ItemKNNCBFRecommender(ICM_all, URM_train)
cosine_best_model.fit(topK=9, shrink=968, similarity='cosine', normalize=True, feature_weighting='TF-IDF')
import os

import numpy as np
import xlearn as xl

from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.data_preprocessing_fm import format_URM_positive_non_compressed, \
    uniform_sampling_strategy, format_URM_positive_user_compressed, \
    format_URM_negative_sampling_user_compressed, mix_URM, add_ICM_info
from src.utils.general_utility_functions import get_split_seed, get_project_root_path

if __name__ == '__main__':
    # Data loading: leave-3-out split with a fixed seed
    dataset = RecSys2019Reader("../../data/")
    dataset = New_DataSplitter_leave_k_out(dataset, k_out_value=3, use_validation_set=False,
                                           force_new_split=True, seed=get_split_seed())
    dataset.load_data()
    URM_train, URM_test = dataset.get_holdout_split()
    ICM_all = dataset.get_ICM_from_name("ICM_all")

    # Build FM training data: user-compressed positives plus uniformly-sampled
    # negatives at a 1:1 rate
    URM_positive_FM_matrix = format_URM_positive_user_compressed(URM_train)
    URM_negative_FM_matrix = format_URM_negative_sampling_user_compressed(URM_train, negative_rate=1,
                                                                          sampling_function=uniform_sampling_strategy,
                                                                          check_replacement=True)
    # [:, :-1] drops the last column — presumably the label column; confirm
    # against mix_URM's contract
    URM_FM_matrix = mix_URM(URM_positive_FM_matrix, URM_negative_FM_matrix)[:, :-1]
    URM_FM_matrix = add_ICM_info(URM_FM_matrix, ICM_all, URM_train.shape[0])

    root_path = get_project_root_path()
    fm_data_path = os.path.join(root_path, "resources", "fm_data")

    # Prepare train sparse matrix and labels for dumping to file
def get_UCM_train_new(reader: New_DataSplitter_leave_k_out):
    """Build the feature-engineered UCM plus a name-to-column-range mapper.

    The ICMs are preprocessed first because some of them (sub-class, item
    popularity) are folded into the UCM before the user features are transformed.

    :param reader: data splitter holding the train split and the loaded ICMs/UCMs
    :return: tuple (UCM_all, user_feature_to_range_mapper); the mapper gives, for
             each source UCM name, the (start, end) column range it occupies in UCM_all
    """
    URM_train, _ = reader.get_holdout_split()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    # Preprocess ICM
    ICM_dict.pop("ICM_all")
    ICM_dict = apply_feature_engineering_ICM(ICM_dict, URM_train, UCM_all_dict,
                                             ICM_names_to_count=["ICM_sub_class"],
                                             UCM_names_to_list=["UCM_age"])
    # Keep only values below Q3 + k * IQR (k is feature-specific)
    ICM_dict = apply_filtering_ICM(
        ICM_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": lambda x: x < np.quantile(x, q=0.75) + 0.72 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25)),
            "ICM_price": lambda x: x < np.quantile(x, q=0.75) + 4 * (np.quantile(x, q=0.75) - np.quantile(x, q=0.25))
        })
    ICM_dict = apply_transformation_ICM(
        ICM_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5)
        })
    ICM_dict = apply_discretization_ICM(
        ICM_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50
        })

    # Preprocess UCM
    UCM_all_dict = apply_feature_engineering_UCM(UCM_all_dict, URM_train, ICM_dict,
                                                 ICM_names_to_UCM=["ICM_sub_class", "ICM_item_pop"])
    UCM_all_dict = apply_feature_entropy_UCM(UCM_all_dict,
                                             UCM_names_to_entropy=["UCM_sub_class"])
    # Apply useful transformation
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict,
        UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict,
        UCM_name_to_bins_mapper={
            "UCM_user_act": 50,
            "UCM_sub_class_entropy": 20
        })

    # Stack the UCMs, recording the column span each one occupies in the result
    UCM_all = None
    user_feature_to_range_mapper = {}
    offset = 0
    for UCM_name, UCM_matrix in UCM_all_dict.items():
        UCM_all = UCM_matrix if UCM_all is None else sps.hstack([UCM_all, UCM_matrix], format="csr")
        user_feature_to_range_mapper[UCM_name] = (offset, offset + UCM_matrix.shape[1])
        offset += UCM_matrix.shape[1]
    return UCM_all, user_feature_to_range_mapper