def test_add_UCM_information(self):
    """Check the age and region columns produced by ``add_UCM_information``.

    A base boosting dataframe is built for the first 5000 user ids, labelled,
    and passed to ``add_UCM_information``. Every row of the result is then
    compared against the demographics read directly from the UCMs:

    * ``age`` / ``age_imputed_flag`` must agree with ``get_user_demographic``;
      a user whose demographic value is -1 must carry the flag and the
      imputed age 5.
    * each ``region_<id>`` column must be 1 exactly when the user has that
      region in ``UCM_region`` and 0 otherwise.
    """
    n_users = 5000
    user_id_array = np.arange(n_users)

    # Dataframe under test
    df = get_boosting_base_dataframe(user_id_array,
                                     self.main_rec,
                                     cutoff=self.cutoff)
    label_array, _, _ = get_label_array(df, self.URM_train)
    df['label'] = label_array
    new_df = add_UCM_information(
        df, self.data_reader.get_original_user_id_to_index_mapper(),
        self.path)

    # Ground truth for the age feature
    UCM_age = self.data_reader.get_UCM_from_name("UCM_age")
    age_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
        "UCM_age")
    age_demographic = get_user_demographic(UCM_age, age_mapper)

    # Ground truth for the region feature: UCM column index -> original id
    UCM_region = self.data_reader.get_UCM_from_name("UCM_region")
    region_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
        "UCM_region")
    id_to_original_region_mapper = {
        index: int(original) for original, index in region_mapper.items()
    }

    for row in range(len(new_df)):
        user, age, age_imputed_flag = (
            new_df[column].iloc[row]
            for column in ('user_id', 'age', 'age_imputed_flag'))

        # Test age
        true_age = age_demographic[user]
        if true_age != -1:
            assert age_imputed_flag == 0
            assert age == true_age
        else:
            assert age_imputed_flag == 1
            assert age == 5  # Imputed value (mode + 1)

        # Test region
        start = UCM_region.indptr[user]
        stop = UCM_region.indptr[user + 1]
        true_regions = [
            id_to_original_region_mapper[region_index]
            for region_index in UCM_region.indices[start:stop]
        ]
        for region in id_to_original_region_mapper.values():
            column_name = "region_{}".format(region)
            region_in_newdf = new_df[column_name].iloc[row]
            expected_value = 1 if region in true_regions else 0
            assert region_in_newdf == expected_value, "User {} has not correct region {}".format(
                user, region)
def __init__(self,
             URM_train,
             UCM_age,
             ICM_subclass,
             subclass_feature_to_id_mapper,
             age_mapper_to_original,
             recommender: BaseRecommender,
             rerank_top_n=10):
    """Set up the recommender state around an inner recommender.

    :param URM_train: training URM; stored as ``self.URM`` and forwarded to
        the parent constructor
    :param UCM_age: UCM holding the age feature, given to
        ``get_user_demographic``
    :param ICM_subclass: ICM holding the sub-class feature, given to
        ``get_sub_class_content``
    :param subclass_feature_to_id_mapper: mapper passed along with
        ``ICM_subclass``
    :param age_mapper_to_original: mapper passed along with ``UCM_age``; its
        keys are also converted to ``int`` to build ``self.age_list``
    :param recommender: recommender kept as ``self.inner_recommender``
    :param rerank_top_n: stored as ``self.rerank_top_n``
    """
    # Data
    self.URM = URM_train

    # Retrieving age information
    self.age_demographic = get_user_demographic(UCM_age,
                                                age_mapper_to_original,
                                                binned=True)
    # NOTE(review): if the mapper keys are strings, np.sort orders them
    # lexicographically ('10' before '2') before the int conversion - confirm
    # that the resulting order of age_list is the intended one.
    sorted_ages = np.sort(np.array(list(age_mapper_to_original.keys())))
    self.age_list = list(map(int, sorted_ages))

    # Subclass information (binned and non-binned variants)
    self.subclass_content_dict = get_sub_class_content(
        ICM_subclass, subclass_feature_to_id_mapper, binned=True)
    self.subclass_content = get_sub_class_content(
        ICM_subclass, subclass_feature_to_id_mapper, binned=False)

    # Age-Subclass
    self.sub_age_dict = {}
    self.count_sub_ace_dict = {}

    # Inner recommender
    self.inner_recommender = recommender
    self.rerank_top_n = rerank_top_n

    # Recommender parameters: all unset at construction time
    self.filter_subclass_age = self.filter_subclass_user = None
    self.min_num_ratings_subclass_user = None
    self.users_subclass = np.array([])
    self.subclass_rerank = None
    self.min_num_ratings_subclass_rerank = None
    self.max_ratings_user_subclass_rerank = None
    self.filter_price_per_user = self.filter_asset_per_user = None
    self.filter_price_per_age = self.filter_asset_per_age = None

    super().__init__(URM_train)
else: f = None # Data loading root_data_path = "../../data/" data_reader = RecSys2019Reader(root_data_path) data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False, force_new_split=True, seed=get_split_seed()) data_reader.load_data() URM_train, URM_test = data_reader.get_holdout_split() ICM_all = get_ICM_train(data_reader) UCM_all = get_UCM_train(data_reader) UCM_age = data_reader.get_UCM_from_name("UCM_age") age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age") age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True) ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class") subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name( "ICM_sub_class") subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True) subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False) # DOES SUBCLASS DISTRIBUTION CHANGES BETWEEN AGES? # Collect distributions age_list = np.sort(np.array(list(age_feature_to_id_mapper.keys()))) age_list = [int(age) for age in age_list] sub_age_dict = {} for age in age_list: users_age = get_users_of_age(age_demographic=age_demographic, age_list=[age]) URM_age = URM_train[users_age].copy()
# Build the user and item content matrices from the loaded data
UCM_age_region = get_warmer_UCM(UCM_age_region, URM_all, threshold_users=3)
UCM_all, _ = merge_UCM(UCM_age_region, URM_train, {}, {})

ICM_categorical = data_reader.get_ICM_from_name("ICM_sub_class")
ICM_all, _ = merge_ICM(ICM_categorical, URM_train.T, {}, {})

# Model definition and fitting
model = best_models.IALS.get_model(URM_train)

# Output folder: base report path plus a timestamped sub-folder.
# The timestamp is taken only after the model has been obtained.
version_path = "../../report/graphics/ials/"
now = datetime.now().strftime('%b%d_%H-%M-%S') + "_k_out_value_3/"
version_path = "/".join([version_path, now])

# Plots
demographic_age = get_user_demographic(UCM_age, URM_all, 3)
demographic_region = get_user_demographic(UCM_region, URM_all, 3)
demographic_list_name = ['age', 'region']
demographic_list = [demographic_age, demographic_region]
basic_plots_recommender(
    model,
    URM_train,
    URM_test,
    output_path_folder=version_path,
    save_on_file=True,
    compare_top_pop_points=None,
    is_compare_top_pop=True,
    demographic_list=demographic_list,
    demographic_list_name=demographic_list_name,
)
if __name__ == '__main__': data_reader = RecSys2019Reader("../../data/") data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False, force_new_split=True, seed=get_split_seed()) data_reader.load_data() URM_train, URM_test = data_reader.get_holdout_split() URM_all = data_reader.dataReader_object.get_URM_all() path = "../../report/hp_tuning/p3alpha/Nov23_14-29-55_k_out_value_3/" UCM_region = data_reader.dataReader_object.get_UCM_from_name('UCM_region') region_demographic = get_user_demographic(UCM_region, URM_all, 3) UCM_age = data_reader.dataReader_object.get_UCM_from_name('UCM_age') age_demographic = get_user_demographic(UCM_age, URM_all, 3) demographics = [region_demographic, age_demographic] demographics_names = ["region", "age"] basic_plots_from_tuning_results( path, P3alphaRecommender, URM_train, URM_test, save_on_file=True, demographic_list=demographics, demographic_list_name=demographics_names,
def test_get_train_dataframe_proportion(self):
    """Validate the dataframe returned by ``get_train_dataframe_proportion``.

    The dataframe is built for the first 500 user ids with ``proportion=1``
    and ``self.main_rec`` as both main recommender and only recommender in
    the list. The following properties are then checked:

    * rows are ordered by ``user_id``;
    * overlap with the recommender's top-``cutoff`` items (see the review
      note in the body: this check is currently vacuous);
    * ``label`` equals the corresponding ``URM_train`` entries;
    * the recommender score column equals the min-max scaled item scores;
    * ``age`` / ``age_imputed_flag`` / ``region_<id>`` agree with the UCMs;
    * ``user_act`` equals the user profile length in ``URM_train``;
    * ``item_pop`` equals the number of stored interactions of the item.
    """
    n_users = 500
    user_id_array = np.arange(n_users)
    df = get_train_dataframe_proportion(
        user_id_array,
        self.cutoff,
        self.main_rec,
        self.path,
        mapper=self.data_reader.get_original_user_id_to_index_mapper(),
        recommender_list=[self.main_rec],
        URM_train=self.URM_train,
        proportion=1)

    users = df['user_id'].values
    items = df['item_id'].values

    # Test that the df is ordered by user_id.
    # The comparison is vectorised on purpose: np.all() applied to a
    # generator expression is always truthy, so it would never fail.
    assert np.all(users[:-1] <= users[1:])

    # Test get_boosting_base_dataframe
    _, user_indptr = np.unique(users, return_index=True)
    user_indptr = np.concatenate([user_indptr, [users.size]])
    true_recommendations = np.array(
        self.main_rec.recommend(user_id_array=user_id_array,
                                cutoff=self.cutoff,
                                remove_seen_flag=True))
    user_recommendations_items = true_recommendations.reshape(
        (true_recommendations.size, 1)).squeeze()
    flag = False
    for i, user in enumerate(user_id_array):
        # NOTE(review): this slice starts and ends at user_indptr[user], so
        # df_items is always empty and the assertion below cannot fail. The
        # end bound was presumably meant to be user_indptr[user + 1]; left
        # unchanged because the corrected check may not hold for the way the
        # dataframe is built - confirm the intended property before fixing.
        df_items = df['item_id'].iloc[
            user_indptr[user]:user_indptr[user]].values
        true_items = user_recommendations_items[i * self.cutoff:i *
                                                self.cutoff + self.cutoff]
        if np.any(np.isin(df_items, true_items, assume_unique=True)):
            flag = True
            break
    assert not flag

    # Test labels value
    labels = np.array(self.URM_train[users, items].tolist()).flatten()
    assert np.array_equal(labels, df['label'].values)

    # Test recommender predictions: scores are min-max scaled over the whole
    # score matrix before being compared with the dataframe column
    all_scores = self.main_rec._compute_item_score(user_id_array)
    scaler = MinMaxScaler()
    scaler.fit(all_scores.reshape(-1, 1))
    all_scores = np.reshape(scaler.transform(all_scores.reshape(-1, 1)),
                            all_scores.shape)
    assert np.array_equal(df[self.main_rec.RECOMMENDER_NAME].values,
                          all_scores[users, items])

    # Test advanced subclass

    # Test ICM information

    # Test UCM information
    UCM_age = self.data_reader.get_UCM_from_name("UCM_age")
    age_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
        "UCM_age")
    age_demographic = get_user_demographic(UCM_age, age_mapper)

    UCM_region = self.data_reader.get_UCM_from_name("UCM_region")
    region_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
        "UCM_region")
    id_to_original_region_mapper = {
        v: int(k) for k, v in region_mapper.items()
    }

    original_regions = list(id_to_original_region_mapper.values())
    region_columns = {
        region: df["region_{}".format(region)].values
        for region in original_regions
    }
    ages = df['age'].values
    age_imputed_flags = df['age_imputed_flag'].values

    for row, (user, age, age_imputed_flag) in enumerate(
            zip(users, ages, age_imputed_flags)):
        # Test age
        true_age = age_demographic[user]
        if true_age == -1:
            assert age_imputed_flag == 1
            assert age == 5  # Imputed value (mode + 1)
        else:
            assert age_imputed_flag == 0
            assert age == true_age

        # Test region
        region_indices = UCM_region.indices[
            UCM_region.indptr[user]:UCM_region.indptr[user + 1]]
        true_regions = {
            id_to_original_region_mapper[region_index]
            for region_index in region_indices
        }
        for region in original_regions:
            expected_value = 1 if region in true_regions else 0
            assert region_columns[region][row] == expected_value, \
                "User {} has not correct region {}".format(user, region)

    # Test user_activity: profile length is the size of the user's indptr
    # range (URM_train is indexed here as a CSR matrix)
    user_profile_lengths = np.diff(self.URM_train.indptr)
    assert np.array_equal(df['user_act'].values,
                          user_profile_lengths[users])

    # Test item_popularity: same computation on the CSC form of the URM
    URM_train_csc = self.URM_train.tocsc()
    item_popularity = np.diff(URM_train_csc.indptr)
    assert np.array_equal(df['item_pop'].values, item_popularity[items])