from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

import Utils.Split.split_train_validation_leave_k_out as loo
from OwnUtils.Extractor import Extractor
from OwnUtils.Writer import Writer
from OwnUtils.Evaluator import evaluate_algorithm
# Adjust this import to WeightedHybrid's actual location in the repo.
from WeightedHybrid import WeightedHybrid

# Assumed definition of the similarity options offered to the KNN models;
# restore the original list if it differs.
similarity_type = ["cosine", "jaccard", "asymmetric", "dice", "tversky"]


class Optimizer(object):

    def __init__(self):
        self.HYP = {}
        self.report_counter = 60
        self.writer = Writer()

        # Search-space bookkeeping
        self.hyperparams = dict()
        self.hyperparams_names = list()
        self.hyperparams_values = list()
        self.hyperparams_single_value = dict()

        # Extractor for matrices
        extractor = Extractor()
        urm = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()

        # Split into post-validation & test sets for parameter tuning
        matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)
        self.urm_post_validation = matrices[0]
        self.urm_test = matrices[1]

        # Split the post-validation matrix into train & validation.
        # (Merging train and validation back together at the end is awkward,
        # hence the leave-one-out split is applied twice.)
        matrices_for_validation = loo.split_train_leave_k_out_user_wise(
            self.urm_post_validation, 1, False, True)
        self.urm_train = matrices_for_validation[0]
        self.urm_validation = matrices_for_validation[1]

    def optimize_weights(self):
        # Previous hand-tuned values, kept for reference:
        # weights = {'icfknn': 2, 'ucfknn': 0.2, 'cbfknn': 0.5, 'slimbpr': 1,
        #            'puresvd': 1.5, 'als': 1, 'cfw': 3, 'p3a': 2, 'rp3b': 3}
        weights = {}
        weights["icfknn"] = Real(low=0, high=5, prior='uniform')  # high=100000, prior='log-uniform'
        weights["ucfknn"] = Real(low=0, high=5, prior='uniform')
        weights["cbfknn"] = Real(low=0, high=5, prior='uniform')
        weights["slimbpr"] = Real(low=0, high=5, prior='uniform')
        weights["puresvd"] = Real(low=0, high=5, prior='uniform')
        # weights["als"] = Real(low=0, high=5, prior='uniform')
        weights["p3a"] = Real(low=0, high=5, prior='uniform')
        weights["rp3b"] = Real(low=0, high=5, prior='uniform')
        return weights

    def rebuild_weights(self, array):
        return {
            "icfknn": array[0],
            "ucfknn": array[1],
            "cbfknn": array[2],
            "slimbpr": array[3],
            "puresvd": array[4],
            "p3a": array[5],
            "rp3b": array[6]
        }

    def optimize_single_KNN(self):
        # All seven dimensions must always be present: rebuild_single_KNN()
        # and the slicing in evaluate() expect exactly 7 values per KNN.
        # (The original code compared the Categorical "similarity" dimension
        # against the strings "asymmetric"/"tversky", which is always False,
        # so the last three dimensions were silently never added.)
        return {
            "topK": Integer(5, 800),
            "shrink": Integer(0, 1000),
            "similarity": Categorical(similarity_type),
            "normalize": Categorical([True, False]),
            "asymmetric_alpha": Real(low=0, high=2, prior='uniform'),
            "tversky_alpha": Real(low=0, high=2, prior='uniform'),
            "tversky_beta": Real(low=0, high=2, prior='uniform')
        }

    def rebuild_single_KNN(self, array):
        return {
            "topK": array[0],
            "shrink": array[1],
            "similarity": array[2],
            "normalize": array[3],
            "asymmetric_alpha": array[4],
            "tversky_alpha": array[5],
            "tversky_beta": array[6]
        }

    def optimize_all_KNN(self):
        ICFKNN = self.optimize_single_KNN()
        UCFKNN = self.optimize_single_KNN()
        CBFKNN = self.optimize_single_KNN()
        return (ICFKNN, UCFKNN, CBFKNN)

    def optimize_slim(self):
        return {
            "topK": Integer(5, 1000),
            "epochs": Integer(20, 1500),
            "symmetric": Categorical([True, False]),
            "sgd_mode": Categorical(["sgd", "adagrad", "adam"]),
            "lambda_i": Real(low=1e-5, high=1e-2, prior='log-uniform'),
            "lambda_j": Real(low=1e-5, high=1e-2, prior='log-uniform'),
            "learning_rate": Real(low=1e-4, high=1e-1, prior='log-uniform')
        }

    def rebuild_slim(self, array):
        return {
            "topK": array[0],
            "epochs": array[1],
            "symmetric": array[2],
            "sgd_mode": array[3],
            "lambda_i": array[4],
            "lambda_j": array[5],
            "learning_rate": array[6]
        }

    def optimize_puresvd(self):
        return {"num_factors": Integer(5, 1000)}

    def rebuild_puresvd(self, array):
        return {"num_factors": array[0]}

    def optimize_als(self):
        return {
            "alpha_val": Real(low=0, high=2, prior='uniform'),
            "n_factors": Integer(5, 1000),
"regularization": Real(low=1e-4, high=10, prior='log-uniform'), "iterations": Integer(5, 50) } def rebuild_als(self, array): return { "alpha_val": array[0], "n_factors": array[1], "regularization": array[2], "iterations": array[3] } def optimize_p3a(self): return { "topK": Integer(5, 800), "alpha": Real(low=0, high=2, prior='uniform'), "normalize_similarity": Categorical([True, False]) } def rebuild_p3a(self, array): return { "topK": array[0], "alpha": array[1], "normalize_similarity": array[2] } def optimize_rp3beta(self): return { "topK": Integer(5, 800), "alpha": Real(low=0, high=2, prior='uniform'), "beta": Real(low=0, high=2, prior='uniform'), "normalize_similarity": Categorical([True, False]) } def rebuild_rp3beta(self, array): return { "topK": array[0], "alpha": array[1], "beta": array[2], "normalize_similarity": array[3] } def evaluate(self, hyp): # print("NUMBER OF PARAMETERS ON evaluate():" + str(len(hyp))) self.recommender = WeightedHybrid(self.urm_train, self.icm, self.rebuild_single_KNN(hyp[0:7]), self.rebuild_single_KNN(hyp[7:14]), self.rebuild_single_KNN(hyp[14:21]), self.rebuild_slim(hyp[21:28]), self.rebuild_puresvd(hyp[28:29]), None, None, self.rebuild_p3a(hyp[29:32]), self.rebuild_rp3beta(hyp[32:36]), self.rebuild_weights(hyp[36:])) self.recommender.fit() result = evaluate_algorithm(self.urm_validation, self.recommender, at=10) return float(result["MAP"] * (-1)) def post_validation(self, hyp): self.recommender = WeightedHybrid(self.urm_post_validation, self.icm, self.rebuild_single_KNN(hyp[0:7]), self.rebuild_single_KNN(hyp[7:14]), self.rebuild_single_KNN(hyp[14:21]), self.rebuild_slim(hyp[21:28]), self.rebuild_puresvd(hyp[28:29]), None, None, self.rebuild_p3a(hyp[29:32]), self.rebuild_rp3beta(hyp[32:36]), self.rebuild_weights(hyp[36:])) self.recommender.fit() result = evaluate_algorithm(self.urm_test, self.recommender, at=10) self.writer.write_report("\n\n" + str(result), self.report_counter) def evaluate_single(self, hyp): self.recommender = WeightedHybrid(self.urm_train, self.icm, p_icfknn=None, p_ucfknn=None, p_cbfknn=None, p_slimbpr=None, p_puresvd=None, p_als=self.rebuild_als(hyp[0:]), p_cfw=None, p_p3a=None, p_rp3b=None, weights={"als": 1}) self.recommender.fit() result = evaluate_algorithm(self.urm_test, self.recommender, at=10) return float(result["MAP"] * (-1)) def run(self): self.HYP = {} self.HYP["p_icfknn"], self.HYP["p_ucfknn"], self.HYP[ "p_cbfknn"] = self.optimize_all_KNN() self.HYP["p_slimbpr"] = self.optimize_slim() self.HYP["p_puresvd"] = self.optimize_puresvd() # self.HYP["p_als"] = self.optimize_als() self.HYP["p_p3a"] = self.optimize_p3a() self.HYP["p_rp3b"] = self.optimize_rp3beta() self.HYP["weight"] = self.optimeze_weights() self.iterator_to_create_dimension(self.HYP) res = gp_minimize( self.evaluate, self.hyperparams_values, n_calls=70, n_random_starts=20, n_points=10000, # noise = 'gaussian', noise=1e-5, acq_func='gp_hedge', acq_optimizer='auto', random_state=None, verbose=True, n_restarts_optimizer=10, xi=0.01, kappa=1.96, x0=None, y0=None, n_jobs=-1) self.writer.write_report(str(res), self.report_counter) self.create_parameters(res["x"]) self.post_validation(res["x"]) def run_single(self): self.HYP["p_als"] = self.optimize_als() self.iterator_to_create_dimension(self.HYP) res = gp_minimize( self.evaluate_single, self.hyperparams_values, n_calls=70, n_random_starts=20, n_points=10000, # noise = 'gaussian', noise=1e-5, acq_func='gp_hedge', acq_optimizer='auto', random_state=None, verbose=True, n_restarts_optimizer=10, xi=0.01, kappa=1.96, 
                          x0=None,
                          y0=None,
                          n_jobs=-1)

        self.writer.write_report(str(res), self.report_counter)
        self.create_parameters(res["x"])

    def iterator_to_create_dimension(self, to_iterate):
        # Recursively flatten the nested dict of skopt dimensions into the
        # flat list gp_minimize expects. Relies on dict insertion order
        # (Python 3.7+) so the flat vector aligns with the rebuild_* slices.
        skopt_types = [Real, Integer, Categorical]
        for name, hyperparam in to_iterate.items():
            if any(isinstance(hyperparam, sko_type) for sko_type in skopt_types):
                self.hyperparams_names.append(name)
                self.hyperparams_values.append(hyperparam)
                self.hyperparams[name] = hyperparam
            elif isinstance(hyperparam, (str, int, bool)):
                self.hyperparams_single_value[name] = hyperparam
            elif isinstance(hyperparam, dict):
                self.iterator_to_create_dimension(to_iterate[name])
            else:
                raise ValueError("Unexpected parameter type: {} - {}".format(
                    str(name), str(hyperparam)))

    def create_parameters(self, hyp):
        self.report_counter += 1
        # Reports for the full hybrid, disabled while tuning ALS alone:
        # self.writer.write_report("p_icfknn :" + str(self.rebuild_single_KNN(hyp[0:7])), self.report_counter)
        # self.writer.write_report("p_ucfknn :" + str(self.rebuild_single_KNN(hyp[7:14])), self.report_counter)
        # self.writer.write_report("p_cbfknn :" + str(self.rebuild_single_KNN(hyp[14:21])), self.report_counter)
        # self.writer.write_report("p_slimbpr :" + str(self.rebuild_slim(hyp[21:28])), self.report_counter)
        # self.writer.write_report("p_puresvd :" + str(self.rebuild_puresvd(hyp[28:29])), self.report_counter)
        # self.writer.write_report("p_p3a :" + str(self.rebuild_p3a(hyp[29:32])), self.report_counter)
        # self.writer.write_report("p_rp3b :" + str(self.rebuild_rp3beta(hyp[32:36])), self.report_counter)
        # self.writer.write_report("weight :" + str(self.rebuild_weights(hyp[36:])), self.report_counter)
        self.writer.write_report("p_als :" + str(self.rebuild_als(hyp[0:])),
                                 self.report_counter)
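# A minimal, self-contained sketch (illustrative names and a toy objective,
# not the repo's recommenders) of the round-trip the Optimizer relies on:
# nested dicts of skopt dimensions are flattened into the flat list that
# gp_minimize expects, and the objective slices the flat vector back into
# per-component dicts, mirroring iterator_to_create_dimension()/rebuild_*().
def _flatten_demo():
    from skopt import gp_minimize
    from skopt.space import Real, Integer

    search_space = {
        "knn": {"topK": Integer(5, 800), "shrink": Integer(0, 1000)},
        "weights": {"knn": Real(0, 5, prior='uniform')},
    }

    def flatten(space, names, dims):
        # Depth-first walk, same order as iterator_to_create_dimension()
        for name, dim in space.items():
            if isinstance(dim, dict):
                flatten(dim, names, dims)
            else:
                names.append(name)
                dims.append(dim)

    names, dims = [], []
    flatten(search_space, names, dims)

    def objective(x):
        knn_params = {"topK": x[0], "shrink": x[1]}  # rebuild from flat vector
        weight = x[2]
        # Toy stand-in for "fit a recommender and return -MAP@10"
        return -(weight / (knn_params["topK"] + knn_params["shrink"] + 1))

    res = gp_minimize(objective, dims, n_calls=12, n_random_starts=5,
                      random_state=0)
    print(dict(zip(names, res.x)), res.fun)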
import csv

import numpy as np
import scipy.sparse as sps
from tqdm import tqdm

from OwnUtils.Extractor import Extractor
from OwnUtils.Writer import Writer


class CustomExtractor:

    def __init__(self):
        # Use an Extractor instance (the original stored the class itself
        # and passed it around as an explicit `self` argument).
        self.extractor = Extractor()
        self.writer = Writer()
        self.my_path = self.extractor.DATA_FILE_PATH + "xL_data/prova.txt"

        # Some useful variables
        self.urm = self.extractor.get_urm_all()
        self.ucm_age = self.extractor.get_ucm_age().tocsr()
        self.ucm_region = self.extractor.get_ucm_region().tocsr()
        self.icm_asset = self.get_icm_asset()
        self.icm_price = self.get_icm_price()
        self.icm_sub_cat = self.extractor.get_icm_subclass().tocsr()

    def create_validation_test_files(self, write_userf, write_itemf):
        import Utils.Split.split_train_validation_leave_k_out as loo

        urm = self.urm

        # Split into post-validation & test sets for parameter tuning
        matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)

        urm_post_validation = matrices[0]
        self.my_path = self.extractor.DATA_FILE_PATH + "xL_data/post_validation.txt"
        self.urm = urm_post_validation
        self.create_general_file(write_userf, write_itemf)

        urm_test = matrices[1]
        self.my_path = self.extractor.DATA_FILE_PATH + "xL_data/test.txt"
        self.urm = urm_test
        self.create_general_file(write_userf, write_itemf)

        # Split the post-validation matrix into train & validation.
        # (Merging train and validation back together at the end is awkward,
        # hence the leave-one-out split is applied twice.)
        matrices_for_validation = loo.split_train_leave_k_out_user_wise(
            urm_post_validation, 1, False, True)

        urm_train = matrices_for_validation[0]
        self.my_path = self.extractor.DATA_FILE_PATH + "xL_data/train.txt"
        self.urm = urm_train
        self.create_general_file(write_userf, write_itemf)

        urm_validation = matrices_for_validation[1]
        self.my_path = self.extractor.DATA_FILE_PATH + "xL_data/validation.txt"
        self.urm = urm_validation
        self.create_general_file(write_userf, write_itemf)

    def create_general_file(self, write_userf, write_itemf):
        n_users, n_items = self.urm.shape

        for user in tqdm(range(n_users)):
            # Positive items for this user
            positive_interactions = list(self.urm[user].indices)

            # Generate at least one negative sample per user
            to_create = len(positive_interactions)
            if len(positive_interactions) == 0:
                to_create = 1

            # Sample an equal number of negative items. Valid item ids are
            # 0..n_items-1 (the original used high=n_items+1, which could
            # produce an out-of-range id).
            negative_interactions = []
            for _ in range(to_create):
                value = np.random.randint(0, high=n_items)
                while value in positive_interactions:
                    value = np.random.randint(0, high=n_items)
                negative_interactions.append(value)

            # File format: <label> followed by field:feature:value triplets.
            # Label: 1 for a positive interaction, 0 for a negative one.
            # 0 --------- Interaction Layer
            #     0 - User
            #     1 - Item
            # 1 --------- User Content Layer
            #     0 - Age
            #     1 - Region 1
            #     2 - Region 2
            # 2 --------- Item Content Layer
            #     0 - Asset
            #     1 - Price
            #     2 - Sub Cat

            # Writing stage
            for item in positive_interactions:
                # 0 --------- Interaction Layer
                row_to_write = "1 0:0:" + str(user) + " 0:1:" + str(item)
                # 1 --------- User Content Layer
                if write_userf:
                    row_to_write += self.create_user_features(user)
                # 2 --------- Item Content Layer
                if write_itemf:
                    row_to_write += self.create_item_features(item)
                row_to_write += "\n"
                self.writer.write_generic(self.my_path, row_to_write)

            for item in negative_interactions:
                # 0 --------- Interaction Layer
                row_to_write = "0 0:0:" + str(user) + " 0:1:" + str(item)
                # 1 --------- User Content Layer
                if write_userf:
                    row_to_write += self.create_user_features(user)
                # 2 --------- Item Content Layer
                if write_itemf:
                    row_to_write += self.create_item_features(item)
                row_to_write += "\n"
                self.writer.write_generic(self.my_path, row_to_write)

    def create_user_features(self, user):
        to_write = ""
        if user < self.ucm_age.shape[0]:
            for age in self.ucm_age[user].indices:
                to_write += " 1:0:" + str(age)
        if user < self.ucm_region.shape[0]:
            for i, region in enumerate(self.ucm_region[user].indices):
                to_write += " 1:" + str(i + 1) + ":" + str(region)
        return to_write

    def create_item_features(self, item):
        to_write = ""
        if item < self.icm_asset.shape[0]:
            for data in self.icm_asset[item].data:
                to_write += " 2:0:" + str(data)
        if item < self.icm_price.shape[0]:
            for data in self.icm_price[item].data:
                to_write += " 2:1:" + str(data)
        if item < self.icm_sub_cat.shape[0]:
            for data in self.icm_sub_cat[item].indices:
                to_write += " 2:2:" + str(data)
        return to_write

    def get_icm_asset(self):
        # Compose the file name
        file_name = self.extractor.DATA_FILE_PATH + "data_ICM_asset.csv"

        with open(file_name) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            items = []
            assets = []
            for line in csv_reader:
                if line_count != 0:  # skip the header row
                    items.append(int(line[0]))
                    assets.append(float(line[2]))
                line_count += 1

        cols = np.zeros(line_count - 1, dtype=int)
        return sps.coo_matrix((assets, (items, cols))).tocsr()

    def get_icm_price(self):
        # Compose the file name
        file_name = self.extractor.DATA_FILE_PATH + "data_ICM_price.csv"

        with open(file_name) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            items = []
            prices = []
            for line in csv_reader:
                if line_count != 0:  # skip the header row
                    items.append(int(line[0]))
                    prices.append(float(line[2]))
                line_count += 1

        cols = np.zeros(line_count - 1, dtype=int)
        return sps.coo_matrix((prices, (items, cols))).tocsr()
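# A toy illustration (hypothetical values) of one row as emitted by
# CustomExtractor.create_general_file(): "<label> field:feature:value ...",
# where field 0 is the interaction layer, 1 the user layer, 2 the item layer.
def _row_format_demo():
    user, item = 42, 7                       # interaction layer
    age, region = 3, 5                       # user content layer
    asset, price, sub_cat = 0.81, 0.25, 11   # item content layer

    row = "1 0:0:{} 0:1:{}".format(user, item)
    row += " 1:0:{} 1:1:{}".format(age, region)
    row += " 2:0:{} 2:1:{} 2:2:{}".format(asset, price, sub_cat)
    print(row)  # 1 0:0:42 0:1:7 1:0:3 1:1:5 2:0:0.81 2:1:0.25 2:2:11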
    def add_features(
            self,
            dataframe,
            user_profile_length=False,
            user_region1=False,
            user_age=False,
            top_pop=False,
            item_price=False,
            item_subclass=False,
    ):
        features_adder = FeatureAdder(dataframe, self.group_length)

        if user_profile_length:
            features_adder.add_user_profile_length()
        if user_region1:
            features_adder.add_user_region1()
        if user_age:
            features_adder.add_user_age()
        if top_pop:
            features_adder.add_item_popularity()

    def retrieve_test_dataframe(self):
        file_name = self.DATA_FILE_PATH + self.TEST_FILE + str(
            self.group_length) + ".csv"
        return pd.read_csv(file_name)


if __name__ == '__main__':
    xgb = XGBoostDataframe(20)

    users, items = xgb.get_user_and_item_lists()
    dataframe = xgb.build_base_dataframe(users, items)
    xgb.add_features(dataframe, **ARGS_FEATURES)

    writer = Writer()
    # writer.save_dataframe(dataframe, 20)
    print(dataframe)
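# ARGS_FEATURES is defined elsewhere in the repo. Given the flags that
# add_features() accepts, it plausibly has this shape (values hypothetical):
#
# ARGS_FEATURES = {
#     "user_profile_length": True,
#     "user_region1": True,
#     "user_age": True,
#     "top_pop": True,
#     "item_price": False,
#     "item_subclass": False,
# }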
import numpy as np

from OwnUtils.Extractor import Extractor
from OwnUtils.Writer import Writer
# Adjust these two imports to the helpers' actual locations in the repo.
from data_splitter import train_test_holdout
from UserCFKNNRecommender import UserCFKNNRecommender

extractor = Extractor()
ratingList = np.ones(extractor.get_numb_interactions())  # currently unused

URM_all = extractor.get_interaction_matrix(False)

# Keep only warm items (at least one interaction)
warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > 0
warm_items = np.arange(URM_all.shape[1])[warm_items_mask]
URM_all = URM_all[:, warm_items]

# Keep only warm users
warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > 0
warm_users = np.arange(URM_all.shape[0])[warm_users_mask]
URM_all = URM_all[warm_users, :]

URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)

recommender = UserCFKNNRecommender(URM_train)
recommender.fit(shrink=0.0, topK=200)

submissionID = 3

userList_unique = extractor.get_user_to_make_rec()

writer = Writer()
fields = ['playlist_id', 'track_ids']
writer.write_header(submissionID, fields)

for user_id in userList_unique:
    writer.write(submissionID, user_id,
                 recommender.recommend(user_id, at=10))

print("Done")
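# Why np.ediff1d(indptr) > 0 selects the "warm" rows/columns above: in a
# CSR (or CSC) matrix, the difference between consecutive indptr entries is
# the number of stored interactions per row (or column). A toy check:
def _warm_mask_demo():
    import numpy as np
    import scipy.sparse as sps

    URM = sps.csr_matrix(np.array([[1, 0, 0],
                                   [0, 0, 0],   # cold user: no interactions
                                   [0, 1, 1]]))
    warm_users_mask = np.ediff1d(URM.indptr) > 0
    print(warm_users_mask)                 # [ True False  True]
    print(URM[warm_users_mask, :].shape)   # (2, 3)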
from OwnUtils.Extractor import Extractor
from TopPopRecommender import TopPopRecommender
from OwnUtils.Writer import Writer
import OwnUtils.Evaluator as ev
import numpy as np

if __name__ == '__main__':
    # Entry point that launches all the other components

    SUBMISSION_NUMBER = 1  # TO BE CHANGED MANUALLY
    field_names = ['playlist_id', 'track_ids']

    # Instantiate the helpers instead of passing the classes themselves
    # around as explicit `self` arguments, as the original did.
    extractor = Extractor()
    writer = Writer()

    users = extractor.get_user_to_make_rec()
    writer.write_header(SUBMISSION_NUMBER, field_names)

    matrices = extractor.get_train_test_matrix()
    URM_train = matrices[0]
    URM_test = matrices[1]

    topPopRecommender_removeSeen = TopPopRecommender()
    topPopRecommender_removeSeen.fit(URM_train)

    unique_users = list(set(extractor.get_interaction_users(False)))
    ev.evaluate_algorithm(URM_test, topPopRecommender_removeSeen,
                          unique_users, at=10)

    for user_id in users:
        recs = topPopRecommender_removeSeen.recommend(user_id, at=10)
        writer.write(SUBMISSION_NUMBER, user_id, recs)
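# For reference, a minimal sketch of the usual top-popular logic. The real
# TopPopRecommender is imported above; this illustrative version (assuming a
# CSR/CSC-convertible URM) is not the repo's implementation.
class TopPopSketch:

    def fit(self, URM_train):
        self.URM_train = URM_train.tocsr()
        # Popularity = number of stored interactions per item (column)
        item_popularity = np.ediff1d(URM_train.tocsc().indptr)
        self.popular_items = np.argsort(item_popularity)[::-1]

    def recommend(self, user_id, at=10):
        # Rank items by global popularity, filtering the user's seen items
        seen = set(self.URM_train[user_id].indices)
        return [i for i in self.popular_items if i not in seen][:at]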