item_popularity = np.sort(item_popularity)
pyplot.plot(item_popularity, 'ro')
pyplot.ylabel('Num Interactions')
pyplot.xlabel('Sorted Item')
pyplot.show()

user_activity = np.ediff1d(URM_all.indptr)
user_activity = np.sort(user_activity)
pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions')
pyplot.xlabel('Sorted User')
pyplot.show()'''

# np.random.seed(1234)
URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.90)
evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)

# Stack the transposed ICM below the URM: item features act as additional "users",
# so collaborative algorithms trained on the stacked matrix also see content information
URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
URM_ICM_train = URM_ICM_train.tocsr()
URM_ICM_train2 = sps.hstack([ICM_all, URM_train.T])
URM_ICM_train2 = URM_ICM_train2.tocsr()

earlystopping_keywargs = {
    "validation_every_n": 10,
    "stop_on_validation": True,
    "evaluator_object": evaluator_validation,
    "lower_validations_allowed": 5,
    "validation_metric": "MAP",  # dict was truncated here; completed from the identical kwargs used later in this file
}
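# Shape sanity check for the stacking above: a minimal toy sketch (hypothetical
# matrices, not part of the pipeline) showing that vstacking ICM.T under the URM
# adds one row per item feature, so the item dimension of the two matrices must match.
URM_toy = sps.random(4, 6, density=0.5, format="csr")   # 4 users x 6 items
ICM_toy = sps.random(6, 3, density=0.5, format="csr")   # 6 items x 3 features
stacked_toy = sps.vstack([URM_toy, ICM_toy.T]).tocsr()  # item features become extra "users"
assert stacked_toy.shape == (4 + 3, 6)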
p3Param = {
    'topK': 64,
    'alpha': 0.5626527178823623,
    'min_rating': 0.4999280105627021,
    'implicit': False,  # fit() expects a boolean; the original value [False, False, False] was a bug (a non-empty list is truthy)
}
alpha1 = 0.4
alpha2 = 0.54  # rounded from float-noise 0.5399999999999999
alpha3 = 0.06  # rounded from float-noise 0.06000000000000005

print("*************************** Ensure the parameters are good **********************")

URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)

# itemCFParam and slimParam are defined earlier in the file
itemCF_recommender = ItemKNNCFRecommender(URM_train)
itemCF_recommender.fit(**itemCFParam)

slim_recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False)
slim_recommender.fit(**slimParam)

p3_recommender = P3alphaRecommender(URM_train)
p3_recommender.fit(**p3Param)

# Blend the three item-item similarity matrices into a single hybrid
recommender1 = SimilarityHybridRecommender(URM_train, itemCF_recommender.W_sparse,
                                           slim_recommender.W_sparse, p3_recommender.W_sparse)
recommender1.fit(topK=100, alpha1=alpha1, alpha2=alpha2, alpha3=alpha3)

evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10])
eval_res = evaluator_validation.evaluateRecommender(recommender1)
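# SimilarityHybridRecommender presumably blends the three W_sparse matrices
# linearly (an assumption based on its alpha1/alpha2/alpha3 signature, not on
# its source). A toy sketch of that combination on hypothetical matrices:
W1_toy = sps.random(6, 6, density=0.3, format="csr")
W2_toy = sps.random(6, 6, density=0.3, format="csr")
W3_toy = sps.random(6, 6, density=0.3, format="csr")
W_hybrid_toy = alpha1 * W1_toy + alpha2 * W2_toy + alpha3 * W3_toy  # same weights as above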
ICM_asset_path = "data/data_ICM_asset.csv"
ICM_asset_file = open(ICM_asset_path, 'r')
ICM_price_path = "data/data_ICM_price.csv"
ICM_price_file = open(ICM_price_path, 'r')
ICM_sub_class = "data/data_ICM_sub_class.csv"
ICM_sub_class_file = open(ICM_sub_class, 'r')
# Note: get_ICM() takes the path directly; the open() handles above are never read
ICM_all, n_items, n_features = get_ICM(ICM_sub_class, URM_all)
print("Number of items is", n_items)
print("n_features is", n_features)

from Notebooks_utils.data_splitter import train_test_holdout

URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
URM_train, URM_validation = train_test_holdout(URM_train, train_perc=0.9)
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

recommender_class = ItemKNNCFRecommender
parameterSearch = SearchBayesianSkopt(recommender_class,
                                      evaluator_validation=evaluator_validation,
                                      evaluator_test=evaluator_test)

output_folder_path = "result_experiments/"

# If the directory does not exist, create it
import os  # if not already imported at the top of the file
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
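# crossval() below saves and loads models through a toFileName(args) helper that
# is not shown in this section. A minimal sketch consistent with that usage
# (hypothetical implementation: flatten a parameter dict into a compact string):
def toFileName(args):
    return ",".join("{}={}".format(key, value) for key, value in args.items())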
def crossval(URM_all, ICM_all, target_ids, k):
    seed = 1234 + k  # + int(time.time())
    np.random.seed(seed)
    tp = 0.75
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=tp)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.95)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    args = {}

    # For each base recommender: try to load a model saved for this seed/split,
    # otherwise fit it from scratch and save it
    p3alpha = P3alphaRecommender.P3alphaRecommender(URM_train)
    try:
        args = {"topK": 991, "alpha": 0.4705816992313091, "normalize_similarity": False}
        p3alpha.load_model('SavedModels\\', p3alpha.RECOMMENDER_NAME + toFileName(args) +
                           ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        p3alpha.fit(**args)
        p3alpha.save_model('SavedModels\\', p3alpha.RECOMMENDER_NAME + toFileName(args) +
                           ",s=" + str(seed) + ",tp=" + str(tp))

    rp3beta = RP3betaRecommender.RP3betaRecommender(URM_train)
    try:
        args = {"topK": 991, "alpha": 0.4705816992313091, "beta": 0.37, "normalize_similarity": False}
        rp3beta.load_model('SavedModels\\', rp3beta.RECOMMENDER_NAME + toFileName(args) +
                           ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        rp3beta.fit(**args)
        rp3beta.save_model('SavedModels\\', rp3beta.RECOMMENDER_NAME + toFileName(args) +
                           ",s=" + str(seed) + ",tp=" + str(tp))

    itemKNNCF = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_train)
    try:
        args = {"topK": 1000, "shrink": 732, "similarity": "cosine", "normalize": True,
                "feature_weighting": "TF-IDF"}
        itemKNNCF.load_model('SavedModels\\', itemKNNCF.RECOMMENDER_NAME + toFileName(args) +
                             ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        itemKNNCF.fit(**args)
        itemKNNCF.save_model('SavedModels\\', itemKNNCF.RECOMMENDER_NAME + toFileName(args) +
                             ",s=" + str(seed) + ",tp=" + str(tp))

    userKNNCF = UserKNNCFRecommender.UserKNNCFRecommender(URM_train)
    try:
        args = {"topK": 131, "shrink": 2, "similarity": "cosine", "normalize": True}
        userKNNCF.load_model('SavedModels\\', userKNNCF.RECOMMENDER_NAME + toFileName(args) +
                             ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        userKNNCF.fit(**args)
        userKNNCF.save_model('SavedModels\\', userKNNCF.RECOMMENDER_NAME + toFileName(args) +
                             ",s=" + str(seed) + ",tp=" + str(tp))

    itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, ICM_all)
    try:
        args = {"topK": 700, "shrink": 100, "similarity": 'jaccard', "normalize": True,
                "feature_weighting": "TF-IDF"}
        itemKNNCBF.load_model('SavedModels\\', itemKNNCBF.RECOMMENDER_NAME + toFileName(args) +
                              ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        itemKNNCBF.fit(**args)
        itemKNNCBF.save_model('SavedModels\\', itemKNNCBF.RECOMMENDER_NAME + toFileName(args) +
                              ",s=" + str(seed) + ",tp=" + str(tp))

    # cfw = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_train, itemKNNCF.W_sparse)
    # cfw.fit(show_max_performance=False, logFile=None, loss_tolerance=1e-6,
    #         iteration_limit=500000, damp_coeff=0.5, topK=900, add_zeros_quota=0.5, normalize_similarity=True)

    # The BPR code would need changes to avoid a memory error; not worth it since its results are poor
    # bpr = SLIM_BPR_Cython(URM_train, recompile_cython=False)
    # bpr.fit(**{"topK": 1000, "epochs": 130, "symmetric": False, "sgd_mode": "adagrad", "lambda_i": 1e-05,
    #            "lambda_j": 0.01, "learning_rate": 0.0001})

    pureSVD = PureSVDRecommender.PureSVDRecommender(URM_train)
    pureSVD.fit(num_factors=1000)

    hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, itemKNNCBF)
    hyb.fit(alpha=0.5)  # Kaggle MAP 0.084: rp3beta, itemKNNCBF

    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, pureSVD, itemKNNCBF)
    hyb2.fit(alpha=0.5)  # Kaggle MAP 0.08667

    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb, hyb2)
    hyb3.fit(alpha=0.5)
    # hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, userKNNCF)
    # hyb3.fit(alpha=0.5)

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_all)  # Kaggle MAP 0.08856
    try:
        # Full values: "alpha_P": 0.4108657561671193, "alpha": 0.6290871066510789
        args = {"topK_P": 903, "alpha_P": 0.41086575, "normalize_similarity_P": False, "topK": 448,
                "shrink": 20, "similarity": "tversky", "normalize": True, "alpha": 0.6290871,
                "feature_weighting": "TF-IDF"}
        hyb5.load_model('SavedModels\\', hyb5.RECOMMENDER_NAME + toFileName(args) +
                        ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        hyb5.fit(**args)
        hyb5.save_model('SavedModels\\', hyb5.RECOMMENDER_NAME + toFileName(args) +
                        ",s=" + str(seed) + ",tp=" + str(tp))
    # hyb5.fit(**{"topK_P": 1000, "alpha_P": 0.5432601071314623, "normalize_similarity_P": True, "topK": 620,
    #             "shrink": 0, "similarity": "tversky", "normalize": False, "alpha": 0.5707347522847057,
    #             "feature_weighting": "BM25"})

    # Kaggle MAP 0.086 :(
    # hyb6 = ScoresHybrid3Recommender.ScoresHybrid3Recommender(URM_train, rp3beta, itemKNNCBF, p3alpha)
    # hyb6.fit()

    hyb6 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_train, ICM_all)
    try:
        # Full values: "alpha_P": 0.5081918012150626, "alpha": 0.44740093610861603
        args = {"topK_P": 623, "alpha_P": 0.5081918, "normalize_similarity_P": False, "topK": 1000,
                "shrink": 1000, "similarity": "tversky", "normalize": True, "alpha": 0.4474009,
                "beta_P": 0.0, "feature_weighting": "TF-IDF"}
        hyb6.load_model('SavedModels\\', hyb6.RECOMMENDER_NAME + toFileName(args) +
                        ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except:
        print("Saved model not found. Fitting a new one...")
        hyb6.fit(**args)
        hyb6.save_model('SavedModels\\', hyb6.RECOMMENDER_NAME + toFileName(args) +
                        ",s=" + str(seed) + ",tp=" + str(tp))

    v0 = evaluator_validation.evaluateRecommender(hyb)[0][10]["MAP"]
    v1 = evaluator_validation.evaluateRecommender(hyb2)[0][10]["MAP"]
    v2 = evaluator_validation.evaluateRecommender(hyb3)[0][10]["MAP"]
    v3 = evaluator_validation.evaluateRecommender(hyb5)[0][10]["MAP"]
    v4 = evaluator_validation.evaluateRecommender(hyb6)[0][10]["MAP"]

    # item_list = hyb3.recommend(target_ids, cutoff=10)
    # CreateCSV.create_csv(target_ids, item_list, 'ItemKNNCBF__RP3beta')

    return [v0, v1, v2, v3, v4]
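# crossval() returns one MAP@10 per hybrid. A hypothetical driver (not in the
# original file) that averages each hybrid's MAP over n_folds repeated holdouts:
def run_crossval(URM_all, ICM_all, target_ids, n_folds=5):
    results = np.array([crossval(URM_all, ICM_all, target_ids, k) for k in range(n_folds)])
    for idx, avg_map in enumerate(results.mean(axis=0)):
        print("Hybrid {}: average MAP@10 = {:.5f}".format(idx, avg_map))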
def gethyb():
    start_time = time.time()
    URM_all, user_id_unique, item_id_unique = RecSys2020Reader.load_urm()
    ICM_all = RecSys2020Reader.load_icm_asset()
    target_ids = RecSys2020Reader.load_target()
    np.random.seed(12341288)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
    # ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.995)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    # URM_train = URM_all
    ICM_train = ICM_all
    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()

    # Split users into 5 groups by profile length (number of interactions)
    l_list = []
    profile_length = np.ediff1d(URM_train.indptr)
    block_size = int(len(profile_length) * 0.2)
    sorted_users = np.argsort(profile_length)
    groups = 5
    rec_list = []
    arg_list = []
    name_list = []

    for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))
        users_in_group = sorted_users[start_pos:end_pos]
        users_in_group_p_len = profile_length[users_in_group]
        l_list.append(len(users_in_group))
        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(),
            users_in_group_p_len.min(), users_in_group_p_len.max()))

    hyb_warm = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    hyb_warmV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    # Warm of Kaggle MAP 0.09466
    '''hyb_warm_args = {"topK_P": 127, "alpha_P": 0.35309465855346317, "normalize_similarity_P": False,
                     "topK": 805, "shrink": 307, "similarity": "tversky", "normalize": False,
                     "alpha": 0.486665735781842, "feature_weighting": "TF-IDF"}
    hyb_warmV2_args = {"topK_P": 1496, "alpha_P": 0.4384309705759645, "normalize_similarity_P": False,
                       "topK": 1023, "shrink": 261, "similarity": "asymmetric", "normalize": False,
                       "alpha": 0.7211670365702352, "feature_weighting": "TF-IDF"}'''
    hyb_warm_args = {"topK_P": 2000, "alpha_P": 0.5202318972174075, "normalize_similarity_P": False,
                     "topK": 2000, "shrink": 2000, "similarity": "tversky", "normalize": True,
                     "alpha": 1.0, "beta_P": 0.33040913500424834, "feature_weighting": "none"}
    hyb_warmV2_args = {"topK_P": 1238, "alpha_P": 0.580501466821829, "normalize_similarity_P": False,
                       "topK": 1043, "shrink": 163, "similarity": "asymmetric", "normalize": False,
                       "alpha": 0.25081946305309705, "feature_weighting": "BM25"}
    # {"topK_P": 2000, "alpha_P": 0.5292482627931302, "normalize_similarity_P": False, "topK": 2000, "shrink": 0,
    #  "similarity": "tanimoto", "normalize": True, "alpha": 0.7963434906265208, "beta_P": 0.2692980157925566,
    #  "feature_weighting": "BM25"}

    hyb_cold = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    '''hyb_cold_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False,
                     "topK": 1500, "shrink": 212, "similarity": "cosine", "normalize": False,
                     "alpha": 0.6841610038073574, "feature_weighting": "BM25"}  # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 326, "alpha_P": 0.5120656418370607, "normalize_similarity_P": False,
                       "topK": 151, "shrink": 183, "similarity": "tversky", "normalize": True,
                       "alpha": 0.6290067931193662, "feature_weighting": "BM25"}'''
    hyb_cold_args = {"topK_P": 2093, "alpha_P": 0.8263868403373367, "normalize_similarity_P": False,
                     "topK": 298, "shrink": 1954, "similarity": "tanimoto", "normalize": False,
                     "alpha": 0.608862998163905, "beta_P": 0.34975586706651757, "feature_weighting": "TF-IDF"}
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 1490, "alpha_P": 0.5832972099071866, "normalize_similarity_P": False,
                       "topK": 1533, "shrink": 1100, "similarity": "tanimoto", "normalize": False,
                       "alpha": 0.15358895478386428, "beta_P": 0.002234792201790459, "feature_weighting": "BM25"}
    '''hyb_midV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    # Cold of Kaggle MAP 0.09466
    hyb_midV2_args = {"topK_P": 2064, "alpha_P": 1.9131180703120496, "normalize_similarity_P": False,
                      "topK": 154, "shrink": 620, "similarity": "asymmetric", "normalize": True,
                      "alpha": 0.013221786654690208, "feature_weighting": "TF-IDF"}
    # {"topK_P": 1577, "alpha_P": 0.1835912052126545, "normalize_similarity_P": false, "topK": 1439, "shrink": 3626,
    # "similarity": "cosine", "normalize": false, "alpha": 0.1507714323088927, "feature_weighting": "BM25"}'''

    rec_list.append(hyb_cold)
    arg_list.append(hyb_cold_args)
    name_list.append("hyb_cold")
    rec_list.append(hyb_warm)
    arg_list.append(hyb_warm_args)
    name_list.append("hyb_warm")
    rec_list.append(hyb_warmV2)
    arg_list.append(hyb_warmV2_args)
    name_list.append("hyb_warmV2")
    rec_list.append(hyb_coldV2)
    arg_list.append(hyb_coldV2_args)
    name_list.append("hyb_coldV2")
    '''rec_list.append(hyb_midV2)
    arg_list.append(hyb_midV2_args)
    name_list.append("hyb_midV2")'''

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    hyb5_args = {"topK_P": 903, "alpha_P": 0.4108657561671193, "normalize_similarity_P": False,
                 "topK": 448, "shrink": 5, "similarity": "tversky", "normalize": True,
                 "alpha": 0.6290871066510789, "feature_weighting": "TF-IDF"}
    rec_list.append(hyb5)
    arg_list.append(hyb5_args)
    name_list.append("hyb5")

    # Fit all the specialized hybrids in parallel
    tot_args = zip(rec_list, arg_list, name_list)
    pool = PoolWithSubprocess(processes=5, maxtasksperchild=1)
    resultList = pool.map(fitRec, tot_args)
    pool.close()
    pool.join()

    for el in resultList:
        if el[1] == "hyb_cold":
            hyb_cold = el[0]
        elif el[1] == "hyb_warm":
            hyb_warm = el[0]
        elif el[1] == "hyb_coldV2":
            hyb_coldV2 = el[0]
        elif el[1] == "hyb_midV2":
            hyb_midV2 = el[0]
        elif el[1] == "hyb_warmV2":
            hyb_warmV2 = el[0]
        elif el[1] == "hyb5":
            hyb5 = el[0]
        elif el[1] == "hyb6x":
            hyb6x = el[0]

    # cold, coldV2 and mid are the new ones
    # hyb = hyb_warm
    # hyb2 = hyb_cold
    hyb3 = ScoresHybridKNNCFKNNCBF.ScoresHybridKNNCFKNNCBF(URM_ICM_train, URM_ICM_train.T)
    hyb3.fit(**{"topK_CF": 488, "shrink_CF": 1500, "similarity_CF": "tversky", "normalize_CF": True,
                "topK": 1500, "shrink": 1500, "similarity": "asymmetric", "normalize": False,
                "alpha": 0.23233349150222427, "feature_weighting": "BM25"})

    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb_warm, hyb5)
    hyb2.fit(alpha=0.5)
    hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb_warmV2, hyb5)
    hyb6.fit(alpha=0.5)
    hyb7 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb6, hyb2)
    hyb7.fit(alpha=0.5)
    # hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb7)
    # hyb.fit(alpha=0.5)

    earlystopping_keywargs = {
        "validation_every_n": 1,
        "stop_on_validation": True,
        "evaluator_object": evaluator_validation,
        "lower_validations_allowed": 3,
        "validation_metric": "MAP",
    }
    ials = IALSRecommender.IALSRecommender(URM_ICM_train)
    ials.fit(**earlystopping_keywargs, num_factors=100, alpha=50)
    hyb = ials
    hyb7 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb2, ials)
    hyb7.fit(alpha=0.5)
    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb2, ials)
    hyb3.fit(alpha=0.85)

    MAP_p3alpha_per_group = []
    MAP_itemKNNCF_per_group = []
    MAP_itemKNNCBF_per_group = []
    MAP_pureSVD_per_group = []
    MAP_hyb_per_group = []
    MAP_hyb2_per_group = []
    MAP_hyb3_per_group = []
    MAP_hyb5_per_group = []
    MAP_hyb6_per_group = []
    MAP_hyb7_per_group = []
    cutoff = 10
    args = {"block_size": block_size, "profile_length": profile_length, "sorted_users": sorted_users,
            "cutoff": cutoff, "URM_test": URM_test, "hyb": hyb, "hyb2": hyb2, "hyb3": hyb3,
            "hyb5": hyb5, "hyb6": hyb6, "hyb7": hyb7}

    # Evaluate every hybrid on each user group in parallel
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    compute_group_MAP_partial = partial(compute_group_MAP, args)
    resultList = pool.map(compute_group_MAP_partial, range(0, groups))
    pool.close()
    pool.join()
    for el in resultList:
        MAP_hyb_per_group.append(el[0])
        MAP_hyb2_per_group.append(el[1])
        MAP_hyb3_per_group.append(el[2])
        MAP_hyb5_per_group.append(el[3])
        MAP_hyb6_per_group.append(el[4])
        if hyb7 is not None:
            MAP_hyb7_per_group.append(el[5])

    # Needed because of memory error
    '''for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))
        users_in_group = sorted_users[start_pos:end_pos]
        users_in_group_p_len = profile_length[users_in_group]
        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(),
            users_in_group_p_len.min(), users_in_group_p_len.max()))
        users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
        users_not_in_group = sorted_users[users_not_in_group_flag]
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[cutoff], ignore_users=users_not_in_group)
        results, _ = evaluator_test.evaluateRecommender(hyb7)
        MAP_hyb7_per_group.append(results[cutoff]["MAP"])'''

    import matplotlib.pyplot as pyplot
    '''pyplot.plot(MAP_p3alpha_per_group, label="p3alpha")
    pyplot.plot(MAP_itemKNNCF_per_group, label="itemKNNCF")
    pyplot.plot(MAP_itemKNNCBF_per_group, label="itemKNNCBF")
    pyplot.plot(MAP_pureSVD_per_group, label="pureSVD")'''
    pyplot.plot(MAP_hyb_per_group, label="hyb")
    pyplot.plot(MAP_hyb2_per_group, label="hyb2")
    pyplot.plot(MAP_hyb3_per_group, label="hyb3")
    pyplot.plot(MAP_hyb5_per_group, label="hyb5")
    pyplot.plot(MAP_hyb6_per_group, label="hyb6")
    if hyb7 is not None:
        pyplot.plot(MAP_hyb7_per_group, label="hyb7")
    pyplot.ylabel('MAP')
    pyplot.xlabel('User Group')
    pyplot.legend()
    pyplot.show()
    print(l_list)

    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    if hyb7 is not None:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6, hyb7]
    else:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6]
    resultList = pool.map(evaluator_validation.evaluateRecommender, hyb_list)
    pool.close()
    pool.join()
    for el in resultList:
        print(el)

    '''item_list = hyb7.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM_cold_warm_V2_more_mix_mid')
    item_list = hyb2.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb2')
    item_list = hyb6.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM')'''

    print("--- Execution time: %s seconds ---" % (time.time() - start_time))
    return hyb2
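# gethyb() maps fitRec over (recommender, args, name) tuples and reads back
# (fitted recommender, name) pairs. fitRec itself is not shown in this section;
# a minimal sketch consistent with that usage (hypothetical implementation):
def fitRec(rec_args_name):
    recommender, args, name = rec_args_name
    recommender.fit(**args)
    return [recommender, name]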
def crossval(URM_all, ICM_all, target_ids, k):
    # Note: this second definition shadows the crossval() above; callers get this version
    seed = 1234 + k  # + int(time.time())
    np.random.seed(seed)  # bug fix: the original called np.random.seed() without the computed seed
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.95)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    args = {}

    p3alpha = P3alphaRecommender.P3alphaRecommender(URM_train)
    args = {"topK": 991, "alpha": 0.4705816992313091, "normalize_similarity": False}
    p3alpha.fit(**args)

    # p3alpha2 = P3alphaRecommender.P3alphaRecommender(URM_train)
    # args = {"topK": 400, "alpha": 0.5305816992313091, "normalize_similarity": False}
    # p3alpha2.fit(**args)

    # rp3beta = RP3betaRecommender.RP3betaRecommender(URM_train)
    # args = {"topK": 991, "alpha": 0.4705816992313091, "beta": 0.15, "normalize_similarity": False}
    # rp3beta.fit(**args)

    itemKNNCF = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_train)
    args = {"topK": 1000, "shrink": 732, "similarity": "cosine", "normalize": True,
            "feature_weighting": "TF-IDF"}
    itemKNNCF.fit(**args)

    userKNNCF = UserKNNCFRecommender.UserKNNCFRecommender(URM_train)
    args = {"topK": 131, "shrink": 2, "similarity": "cosine", "normalize": True}
    userKNNCF.fit(**args)

    itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, ICM_all)
    args = {"topK": 700, "shrink": 100, "similarity": 'jaccard', "normalize": True,
            "feature_weighting": "TF-IDF"}
    itemKNNCBF.fit(**args)

    itemKNNCBF2 = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, ICM_all)
    args = {"topK": 200, "shrink": 15, "similarity": 'jaccard', "normalize": True,
            "feature_weighting": "TF-IDF"}
    itemKNNCBF2.fit(**args)

    # cfw = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_train, itemKNNCF.W_sparse)
    # cfw.fit(show_max_performance=False, logFile=None, loss_tolerance=1e-6,
    #         iteration_limit=500000, damp_coeff=0.5, topK=900, add_zeros_quota=0.5, normalize_similarity=True)

    # The BPR code would need changes to avoid a memory error; not worth it since its results are poor
    # bpr = SLIM_BPR_Cython(URM_train, recompile_cython=False)
    # bpr.fit(**{"topK": 1000, "epochs": 130, "symmetric": False, "sgd_mode": "adagrad", "lambda_i": 1e-05,
    #            "lambda_j": 0.01, "learning_rate": 0.0001})

    pureSVD = PureSVDRecommender.PureSVDRecommender(URM_train)
    pureSVD.fit(num_factors=340)

    # hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, itemKNNCBF)
    # hyb.fit(alpha=0.5)
    hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, itemKNNCBF, pureSVD)
    hyb.fit(alpha=0.5)  # Kaggle MAP 0.084: rp3beta, itemKNNCBF

    # hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, itemKNNCBF)
    # hyb2.fit(alpha=0.5)
    hyb2 = ItemKNNSimilarityHybridRecommender.ItemKNNSimilarityHybridRecommender(
        URM_train, itemKNNCBF.W_sparse, itemKNNCF.W_sparse)
    hyb2.fit(topK=1600)  # Kaggle MAP 0.08667

    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb, hyb2)
    hyb3.fit(alpha=0.5)
    # hyb3 = RankingHybrid.RankingHybrid(URM_train, hyb, hyb2)
    # hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, userKNNCF)
    # hyb3.fit(alpha=0.5)

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_all)  # Kaggle MAP 0.08856
    args = {"topK_P": 903, "alpha_P": 0.4108657561671193, "normalize_similarity_P": False, "topK": 448,
            "shrink": 20, "similarity": "tversky", "normalize": True, "alpha": 0.6290871066510789,
            "feature_weighting": "TF-IDF"}
    hyb5.fit(**args)
    # hyb5.fit(**{"topK_P": 1000, "alpha_P": 0.5432601071314623, "normalize_similarity_P": True, "topK": 620,
    #             "shrink": 0, "similarity": "tversky", "normalize": False, "alpha": 0.5707347522847057,
    #             "feature_weighting": "BM25"})

    # Kaggle MAP 0.086 :(
    # hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb5)
    # hyb6.fit()
    hyb6 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_all)
    args = {"topK_P": 756, "alpha_P": 0.5292654015790155, "normalize_similarity_P": False, "topK": 1000,
            "shrink": 47, "similarity": "tversky", "normalize": False, "alpha": 0.5207647439152092,
            "feature_weighting": "none"}
    hyb6.fit(**args)
    '''hyb6 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_train, ICM_all)
    args = {"topK_P": 623, "alpha_P": 0.5081918012150626, "normalize_similarity_P": False, "topK": 1000,
            "shrink": 1000, "similarity": "tversky", "normalize": True, "alpha": 0.44740093610861603,
            "beta_P": 0.0, "feature_weighting": "TF-IDF"}
    hyb6.fit(**args)'''

    hyb7 = RankingHybrid.RankingHybrid(URM_train, hyb6, hyb3)

    v0 = evaluator_validation.evaluateRecommender(hyb)[0][10]["MAP"]
    v1 = evaluator_validation.evaluateRecommender(hyb2)[0][10]["MAP"]
    v2 = evaluator_validation.evaluateRecommender(hyb3)[0][10]["MAP"]
    # v2 = 0
    v3 = evaluator_validation.evaluateRecommender(hyb5)[0][10]["MAP"]
    v4 = evaluator_validation.evaluateRecommender(hyb6)[0][10]["MAP"]
    # v4 = 0
    v5 = evaluator_validation.evaluateRecommender(hyb7)[0][10]["MAP"]

    # item_list = hyb6.recommend(target_ids, cutoff=10)
    # CreateCSV.create_csv(target_ids, item_list, 'HybPureSVD')

    return [v0, v1, v2, v3, v4, v5]
def gethyb():
    # Note: this second definition shadows the gethyb() above; callers get this version
    start_time = time.time()
    URM_all, user_id_unique, item_id_unique = RecSys2020Reader.load_urm()
    ICM_all = RecSys2020Reader.load_icm_asset()
    target_ids = RecSys2020Reader.load_target()
    # np.random.seed(12341288)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
    # ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.995)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    # URM_train = URM_all
    ICM_train = ICM_all
    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()

    # Split users into 5 groups by profile length (number of interactions)
    l_list = []
    profile_length = np.ediff1d(URM_train.indptr)
    block_size = int(len(profile_length) * 0.2)
    sorted_users = np.argsort(profile_length)
    groups = 5
    rec_list = []
    arg_list = []
    name_list = []

    for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))
        users_in_group = sorted_users[start_pos:end_pos]
        users_in_group_p_len = profile_length[users_in_group]
        l_list.append(len(users_in_group))
        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(),
            users_in_group_p_len.min(), users_in_group_p_len.max()))

    hyb_warm = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, URM_train.T)
    hyb_warmV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Warm of Kaggle MAP 0.09466
    '''hyb_warm_args = {"topK_P": 127, "alpha_P": 0.35309465855346317, "normalize_similarity_P": False,
                     "topK": 805, "shrink": 307, "similarity": "tversky", "normalize": False,
                     "alpha": 0.486665735781842, "feature_weighting": "TF-IDF"}
    hyb_warmV2_args = {"topK_P": 1496, "alpha_P": 0.4384309705759645, "normalize_similarity_P": False,
                       "topK": 1023, "shrink": 261, "similarity": "asymmetric", "normalize": False,
                       "alpha": 0.7211670365702352, "feature_weighting": "TF-IDF"}'''
    hyb_warm_args = {"topK_P": 1500, "alpha_P": 0.499386187332916, "normalize_similarity_P": False,
                     "topK": 1500, "shrink": 0, "similarity": "cosine", "normalize": False,
                     "alpha": 0.6783844599810798, "feature_weighting": "BM25"}
    hyb_warmV2_args = {"topK_P": 1407, "alpha_P": 0.5102184063631549, "normalize_similarity_P": False,
                       "topK": 62, "shrink": 104, "similarity": "tanimoto", "normalize": False,
                       "alpha": 0.7722938163027667, "feature_weighting": "none"}

    hyb_cold = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    '''hyb_cold_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False,
                     "topK": 1500, "shrink": 212, "similarity": "cosine", "normalize": False,
                     "alpha": 0.6841610038073574, "feature_weighting": "BM25"}  # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 326, "alpha_P": 0.5120656418370607, "normalize_similarity_P": False,
                       "topK": 151, "shrink": 183, "similarity": "tversky", "normalize": True,
                       "alpha": 0.6290067931193662, "feature_weighting": "BM25"}'''
    hyb_cold_args = {"topK_P": 510, "alpha_P": 0.2857363628982497, "normalize_similarity_P": False,
                     "topK": 483, "shrink": 1491, "similarity": "asymmetric", "normalize": True,
                     "alpha": 0.7682805033640728, "feature_weighting": "TF-IDF"}
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 1095, "alpha_P": 0.4546298466859472, "normalize_similarity_P": False,
                       "topK": 866, "shrink": 182, "similarity": "tanimoto", "normalize": False,
                       "alpha": 0.5837079437871213, "feature_weighting": "BM25"}

    hyb_midV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Cold of Kaggle MAP 0.09466
    hyb_midV2_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False,
                      "topK": 1500, "shrink": 212, "similarity": "cosine", "normalize": False,
                      "alpha": 0.6841610038073574, "feature_weighting": "BM25"}

    rec_list.append(hyb_cold)
    arg_list.append(hyb_cold_args)
    name_list.append("hyb_cold")
    rec_list.append(hyb_warm)
    arg_list.append(hyb_warm_args)
    name_list.append("hyb_warm")
    rec_list.append(hyb_warmV2)
    arg_list.append(hyb_warmV2_args)
    name_list.append("hyb_warmV2")
    rec_list.append(hyb_coldV2)
    arg_list.append(hyb_coldV2_args)
    name_list.append("hyb_coldV2")
    rec_list.append(hyb_midV2)
    arg_list.append(hyb_midV2_args)
    name_list.append("hyb_midV2")

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    hyb5_args = {"topK_P": 903, "alpha_P": 0.4108657561671193, "normalize_similarity_P": False,
                 "topK": 448, "shrink": 5, "similarity": "tversky", "normalize": True,
                 "alpha": 0.6290871066510789, "feature_weighting": "TF-IDF"}
    rec_list.append(hyb5)
    arg_list.append(hyb5_args)
    name_list.append("hyb5")

    # Fit all the specialized hybrids in parallel
    tot_args = zip(rec_list, arg_list, name_list)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    resultList = pool.map(fitRec, tot_args)
    pool.close()
    pool.join()

    for el in resultList:
        if el[1] == "hyb_cold":
            hyb_cold = el[0]
        elif el[1] == "hyb_warm":
            hyb_warm = el[0]
        elif el[1] == "hyb_coldV2":
            hyb_coldV2 = el[0]
        elif el[1] == "hyb_midV2":
            hyb_midV2 = el[0]
        elif el[1] == "hyb_warmV2":
            hyb_warmV2 = el[0]
        elif el[1] == "hyb5":
            hyb5 = el[0]
        elif el[1] == "hyb6x":
            hyb6x = el[0]

    hybuc = ScoresHybridSpecializedV3Warm.ScoresHybridSpecializedV3Warm(URM_train, ICM_all)
    hybuc.fit(**{"topK_P": 509, "alpha_P": 1.045671409326966, "normalize_similarity_P": False,
                 "topK": 1291, "shrink": 430, "similarity": "asymmetric", "normalize": False,
                 "alpha": 0.864672904054673, "feature_weighting": "TF-IDF"})

    hyb2 = hyb_warmV2  # overwritten below
    hyb3 = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb_cold, hyb_warm, 5.9)
    hyb7 = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb_coldV2, hyb_warmV2, 5.9)
    hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb7, hyb5)
    hyb6.fit(alpha=0.5)
    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb6)
    hyb2.fit(alpha=0.5)
    hyb = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb2, hybuc, 300)

    MAP_p3alpha_per_group = []
    MAP_itemKNNCF_per_group = []
    MAP_itemKNNCBF_per_group = []
    MAP_pureSVD_per_group = []
    MAP_hyb_per_group = []
    MAP_hyb2_per_group = []
    MAP_hyb3_per_group = []
    MAP_hyb5_per_group = []
    MAP_hyb6_per_group = []
    MAP_hyb7_per_group = []
    cutoff = 10
    args = {"block_size": block_size, "profile_length": profile_length, "sorted_users": sorted_users,
            "cutoff": cutoff, "URM_test": URM_test, "hyb": hyb, "hyb2": hyb2, "hyb3": hyb3,
            "hyb5": hyb5, "hyb6": hyb6, "hyb7": hyb7}

    # Evaluate every hybrid on each user group in parallel
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    compute_group_MAP_partial = partial(compute_group_MAP, args)
    resultList = pool.map(compute_group_MAP_partial, range(0, groups))
    pool.close()
    pool.join()
    for el in resultList:
        MAP_hyb_per_group.append(el[0])
        MAP_hyb2_per_group.append(el[1])
        MAP_hyb3_per_group.append(el[2])
        MAP_hyb5_per_group.append(el[3])
        MAP_hyb6_per_group.append(el[4])
        if hyb7 is not None:
            MAP_hyb7_per_group.append(el[5])

    # Needed because of memory error
    '''for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))
        users_in_group = sorted_users[start_pos:end_pos]
        users_in_group_p_len = profile_length[users_in_group]
        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(),
            users_in_group_p_len.min(), users_in_group_p_len.max()))
        users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
        users_not_in_group = sorted_users[users_not_in_group_flag]
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[cutoff], ignore_users=users_not_in_group)
        results, _ = evaluator_test.evaluateRecommender(hyb7)
        MAP_hyb7_per_group.append(results[cutoff]["MAP"])'''

    import matplotlib.pyplot as pyplot
    '''pyplot.plot(MAP_p3alpha_per_group, label="p3alpha")
    pyplot.plot(MAP_itemKNNCF_per_group, label="itemKNNCF")
    pyplot.plot(MAP_itemKNNCBF_per_group, label="itemKNNCBF")
    pyplot.plot(MAP_pureSVD_per_group, label="pureSVD")'''
    pyplot.plot(MAP_hyb_per_group, label="hyb")
    pyplot.plot(MAP_hyb2_per_group, label="hyb2")
    pyplot.plot(MAP_hyb3_per_group, label="hyb3")
    pyplot.plot(MAP_hyb5_per_group, label="hyb5")
    pyplot.plot(MAP_hyb6_per_group, label="hyb6")
    if hyb7 is not None:
        pyplot.plot(MAP_hyb7_per_group, label="hyb7")
    pyplot.ylabel('MAP')
    pyplot.xlabel('User Group')
    pyplot.legend()
    pyplot.show()
    print(l_list)

    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    if hyb7 is not None:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6, hyb7]
    else:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6]
    resultList = pool.map(evaluator_validation.evaluateRecommender, hyb_list)
    pool.close()
    pool.join()
    for el in resultList:
        print(el)

    '''item_list = hyb7.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM_cold_warm_V2_more_mix_mid')
    item_list = hyb2.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb2')
    item_list = hyb6.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM')'''

    print("--- Execution time: %s seconds ---" % (time.time() - start_time))
    return hyb2
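# compute_group_MAP is not shown in this section. Based on the commented-out
# per-group loop in gethyb(), a minimal sketch of what it presumably does (the
# exact return order is an assumption): evaluate each hybrid on a single user
# group by ignoring all users outside that group.
def compute_group_MAP(args, group_id):
    start_pos = group_id * args["block_size"]
    end_pos = min((group_id + 1) * args["block_size"], len(args["profile_length"]))
    users_in_group = args["sorted_users"][start_pos:end_pos]
    users_not_in_group_flag = np.isin(args["sorted_users"], users_in_group, invert=True)
    users_not_in_group = args["sorted_users"][users_not_in_group_flag]
    evaluator_test = EvaluatorHoldout(args["URM_test"], cutoff_list=[args["cutoff"]],
                                      ignore_users=users_not_in_group)
    maps = []
    for name in ["hyb", "hyb2", "hyb3", "hyb5", "hyb6", "hyb7"]:
        results, _ = evaluator_test.evaluateRecommender(args[name])
        maps.append(results[args["cutoff"]]["MAP"])
    return maps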