def _load_previously_built_split_and_attributes(self, save_folder_path):
    """
    Loads a previously saved k-fold split: the pickled fold dictionary, every ICM with its
    feature mapper and the global mappers of the split.

    Every key of the pickled fold dictionary becomes an attribute of this object. For each ICM
    name reported by the data reader, the ICM is stored in the attribute named after the ICM and
    its mapper in the attribute "tokenToFeatureMapper_<ICM name>".

    :param save_folder_path: folder containing the saved files, it is concatenated directly with
                             the file names so it must end with the path separator
    :return:
    """

    if self.allow_cold_users:
        allow_cold_users_suffix = "allow_cold_users"
    else:
        allow_cold_users_suffix = "only_warm_users"

    # NOTE: pickle.load executes arbitrary code contained in the file, only load trusted splits
    split_file_name = "URM_{}_fold_split_{}".format(self.n_folds, allow_cold_users_suffix)

    with open(save_folder_path + split_file_name, "rb") as split_file:
        data_dict = pickle.load(split_file)

    for attrib_name, attrib_value in data_dict.items():
        setattr(self, attrib_name, attrib_value)

    for ICM_name in self.dataReader_object.get_loaded_ICM_names():

        with open(save_folder_path + "{}".format(ICM_name), "rb") as ICM_file:
            ICM_object = pickle.load(ICM_file)

        setattr(self, ICM_name, ICM_object)

        mapper_attribute_name = "tokenToFeatureMapper_{}".format(ICM_name)

        with open(save_folder_path + mapper_attribute_name, "rb") as mapper_file:
            mapper_object = pickle.load(mapper_file)

        # The mapper attribute must receive the mapper that was just read, not the ICM
        setattr(self, mapper_attribute_name, mapper_object)

    # MAPPER MANAGING
    name_suffix = "_{}".format(allow_cold_users_suffix)

    dataIO = DataIO(folder_path=save_folder_path)

    self.SPLIT_GLOBAL_MAPPER_DICT = dataIO.load_data(file_name="split_mappers" + name_suffix)
def _set_search_attributes(self, recommender_input_args, metric_to_optimize, output_folder_path,
                           output_file_name_root, resume_from_saved, save_metadata, n_cases):
    """
    Stores the search configuration on this object, creates the output folder if needed, opens
    the log file in append mode and resets the model counter and the metadata dictionary.

    :param recommender_input_args: stored as self.recommender_input_args_list
    :param metric_to_optimize: stored as self.metric_to_optimize
    :param output_folder_path: folder for the log file, it is concatenated directly with the file
                               name so it must end with the path separator
    :param output_file_name_root: prefix of the log file name
    :param resume_from_saved: stored as self.resume_from_saved
    :param save_metadata: stored as self.save_metadata, when true a DataIO bound to the output
                          folder is stored as self.dataIO
    :param n_cases: forwarded to self._init_metadata_dict
    :return:
    """

    self.output_folder_path = output_folder_path
    self.output_file_name_root = output_file_name_root

    # exist_ok replaces the exists-then-create check, which fails with FileExistsError when
    # another process creates the same folder between the two calls
    os.makedirs(self.output_folder_path, exist_ok=True)

    log_file_path = self.output_folder_path + self.output_file_name_root + "_{}.txt".format(self.ALGORITHM_NAME)

    # The handle is stored on the object, so it is intentionally not closed here
    self.log_file = open(log_file_path, "a")

    self.recommender_input_args_list = recommender_input_args
    self.metric_to_optimize = metric_to_optimize
    self.resume_from_saved = resume_from_saved
    self.save_metadata = save_metadata

    self.model_counter = 0
    self._init_metadata_dict(n_cases=n_cases)

    if self.save_metadata:
        self.dataIO = DataIO(folder_path=self.output_folder_path)
def _save_dataset(self, save_folder_path):
    """
    Writes the loaded global mappers and URMs through DataIO and, when the corresponding name
    lists are not empty, the ICMs and UCMs together with their mappers

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    writer = DataIO(folder_path=save_folder_path)

    def store(file_name, content):
        writer.save_data(file_name=file_name, data_dict_to_save=content)

    # Always present
    store("dataset_global_mappers", self._LOADED_GLOBAL_MAPPER_DICT)
    store("dataset_URM", self._LOADED_URM_DICT)

    # Item features, only when at least one ICM is loaded
    if len(self.get_loaded_ICM_names()) > 0:
        store("dataset_ICM", self._LOADED_ICM_DICT)
        store("dataset_ICM_mappers", self._LOADED_ICM_MAPPER_DICT)

    # User features, only when at least one UCM is loaded
    if len(self.get_loaded_UCM_names()) > 0:
        store("dataset_UCM", self._LOADED_UCM_DICT)
        store("dataset_UCM_mappers", self._LOADED_UCM_MAPPER_DICT)
def _load_previously_built_split_and_attributes(self, save_folder_path):
    """
    Restores a saved split: the split parameters (each one becomes an attribute of this object),
    the global mappers, the URMs and, when the data reader reports loaded ICMs, the ICMs with
    their mappers

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    validation_tag = "use_validation_set" if self.use_validation_set else "no_validation_set"
    cold_users_tag = "allow_cold_users" if self.allow_cold_users else "only_warm_users"

    # All the files of one split share this suffix
    name_suffix = "_{}_{}".format(cold_users_tag, validation_tag)

    loader = DataIO(folder_path=save_folder_path)

    saved_parameters = loader.load_data(file_name="split_parameters" + name_suffix)

    for parameter_name, parameter_value in saved_parameters.items():
        setattr(self, parameter_name, parameter_value)

    self.SPLIT_GLOBAL_MAPPER_DICT = loader.load_data(file_name="split_mappers" + name_suffix)
    self.SPLIT_URM_DICT = loader.load_data(file_name="split_URM" + name_suffix)

    # Nothing else to read when the data reader has no ICM
    if len(self.dataReader_object.get_loaded_ICM_names()) == 0:
        return

    self.SPLIT_ICM_DICT = loader.load_data(file_name="split_ICM" + name_suffix)
    self.SPLIT_ICM_MAPPER_DICT = loader.load_data(file_name="split_ICM_mappers" + name_suffix)
def save_model(self, folder_path, file_name=None):
    """
    Saves the item_pop attribute through DataIO

    :param folder_path: folder handed to DataIO
    :param file_name: name of the saved file, self.RECOMMENDER_NAME is used when None
    :return:
    """

    target_file_name = self.RECOMMENDER_NAME if file_name is None else file_name

    self._print("Saving model in file '{}'".format(folder_path + target_file_name))

    model_content = {}
    model_content["item_pop"] = self.item_pop

    DataIO(folder_path=folder_path).save_data(data_dict_to_save=model_content, file_name=target_file_name)

    self._print("Saving complete")
def load_model(self, folder_path, file_name=None):
    """
    Reads a saved model through DataIO and sets every entry of the loaded dictionary as an
    attribute of this object

    :param folder_path: folder handed to DataIO
    :param file_name: name of the file to read, self.RECOMMENDER_NAME is used when None
    :return:
    """

    source_file_name = self.RECOMMENDER_NAME if file_name is None else file_name

    self._print("Loading model from file '{}'".format(folder_path + source_file_name))

    loaded_content = DataIO(folder_path=folder_path).load_data(file_name=source_file_name)

    for attribute_name, attribute_value in loaded_content.items():
        setattr(self, attribute_name, attribute_value)

    self._print("Loading complete")
def _save_dataset(self, save_folder_path):
    """
    Writes the loaded global mappers and URMs through DataIO and, when at least one ICM name is
    loaded, the ICMs together with their mappers

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    writer = DataIO(folder_path=save_folder_path)

    # (attribute holding the data, name of the file it is written to)
    always_saved = (
        ("_LOADED_GLOBAL_MAPPER_DICT", "dataset_global_mappers"),
        ("_LOADED_URM_DICT", "dataset_URM"),
    )

    only_with_ICM = (
        ("_LOADED_ICM_DICT", "dataset_ICM"),
        ("_LOADED_ICM_MAPPER_DICT", "dataset_ICM_mappers"),
    )

    for attribute_name, file_name in always_saved:
        writer.save_data(file_name=file_name, data_dict_to_save=getattr(self, attribute_name))

    if len(self.get_loaded_ICM_names()) == 0:
        return

    for attribute_name, file_name in only_with_ICM:
        writer.save_data(file_name=file_name, data_dict_to_save=getattr(self, attribute_name))
def _load_from_saved_sparse_matrix(self, save_folder_path):
    """
    Reads the saved global mappers and URMs through DataIO and, when at least one ICM name is
    loaded, the ICMs together with their mappers

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    reader = DataIO(folder_path=save_folder_path)

    # (attribute receiving the data, name of the file it is read from)
    always_loaded = (
        ("_LOADED_GLOBAL_MAPPER_DICT", "dataset_global_mappers"),
        ("_LOADED_URM_DICT", "dataset_URM"),
    )

    only_with_ICM = (
        ("_LOADED_ICM_DICT", "dataset_ICM"),
        ("_LOADED_ICM_MAPPER_DICT", "dataset_ICM_mappers"),
    )

    for attribute_name, file_name in always_loaded:
        setattr(self, attribute_name, reader.load_data(file_name=file_name))

    if len(self.get_loaded_ICM_names()) == 0:
        return

    for attribute_name, file_name in only_with_ICM:
        setattr(self, attribute_name, reader.load_data(file_name=file_name))
def _set_search_attributes(self, recommender_input_args, recommender_input_args_last_test, metric_to_optimize,
                           output_folder_path, output_file_name_root, resume_from_saved, save_metadata,
                           save_model, evaluate_on_test_each_best_solution, n_cases):
    """
    Validates save_model, stores the search configuration on this object, creates the output
    folder if needed, opens the log file in append mode and resets the model counter and the
    metadata dictionary.

    :param recommender_input_args: stored as self.recommender_input_args
    :param recommender_input_args_last_test: stored as self.recommender_input_args_last_test,
                                             when None a save_model of "last" is turned into "best"
    :param metric_to_optimize: stored as self.metric_to_optimize
    :param output_folder_path: folder for the log file, it is concatenated directly with the file
                               name so it must end with the path separator
    :param output_file_name_root: prefix of the log file name
    :param resume_from_saved: stored as self.resume_from_saved
    :param save_metadata: stored as self.save_metadata, when true a DataIO bound to the output
                          folder is stored as self.dataIO
    :param save_model: must be one of self._SAVE_MODEL_VALUES
    :param evaluate_on_test_each_best_solution: stored as self.evaluate_on_test_each_best_solution
    :param n_cases: forwarded to self._init_metadata_dict
    :raises ValueError: if save_model is not in self._SAVE_MODEL_VALUES
    :return:
    """

    if save_model not in self._SAVE_MODEL_VALUES:
        raise ValueError("{}: parameter save_model must be in '{}', provided was '{}'.".format(
            self.ALGORITHM_NAME, self._SAVE_MODEL_VALUES, save_model))

    self.output_folder_path = output_folder_path
    self.output_file_name_root = output_file_name_root

    # exist_ok replaces the exists-then-create check, which fails with FileExistsError when
    # another process creates the same folder between the two calls
    os.makedirs(self.output_folder_path, exist_ok=True)

    log_file_path = self.output_folder_path + self.output_file_name_root + "_{}.txt".format(self.ALGORITHM_NAME)

    # The handle is stored on the object, so it is intentionally not closed here
    self.log_file = open(log_file_path, "a")

    # Must come after the log file is opened because the fallback is reported with _write_log
    if save_model == "last" and recommender_input_args_last_test is None:
        self._write_log(
            "{}: parameter save_model is 'last' but no recommender_input_args_last_test provided, saving best model on train data alone.".format(
                self.ALGORITHM_NAME))
        save_model = "best"

    self.recommender_input_args = recommender_input_args
    self.recommender_input_args_last_test = recommender_input_args_last_test
    self.metric_to_optimize = metric_to_optimize
    self.save_model = save_model
    self.resume_from_saved = resume_from_saved
    self.save_metadata = save_metadata
    self.evaluate_on_test_each_best_solution = evaluate_on_test_each_best_solution

    self.model_counter = 0
    self._init_metadata_dict(n_cases=n_cases)

    if self.save_metadata:
        self.dataIO = DataIO(folder_path=self.output_folder_path)
def _save_split(self, save_folder_path):
    """
    Writes the split through DataIO: split parameters, global mappers, URMs and, when the data
    reader reports loaded ICMs, the ICMs with their mappers. Does nothing when save_folder_path
    is empty or None

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    if not save_folder_path:
        return

    cold_users_tag = "allow_cold_users" if self.allow_cold_users else "only_warm_users"
    validation_tag = "use_validation_set" if self.use_validation_set else "no_validation_set"

    # All the files of one split share this suffix
    name_suffix = "_{}_{}".format(cold_users_tag, validation_tag)

    parameters_to_save = {}
    parameters_to_save["k_out_value"] = self.k_out_value
    parameters_to_save["allow_cold_users"] = self.allow_cold_users

    writer = DataIO(folder_path=save_folder_path)

    writer.save_data(file_name="split_parameters" + name_suffix, data_dict_to_save=parameters_to_save)
    writer.save_data(file_name="split_mappers" + name_suffix, data_dict_to_save=self.SPLIT_GLOBAL_MAPPER_DICT)
    writer.save_data(file_name="split_URM" + name_suffix, data_dict_to_save=self.SPLIT_URM_DICT)

    # Nothing else to write when the data reader has no ICM
    if len(self.dataReader_object.get_loaded_ICM_names()) == 0:
        return

    writer.save_data(file_name="split_ICM" + name_suffix, data_dict_to_save=self.SPLIT_ICM_DICT)
    writer.save_data(file_name="split_ICM_mappers" + name_suffix, data_dict_to_save=self.SPLIT_ICM_MAPPER_DICT)
def save_model(self, folder_path, file_name=None):
    """
    Saves the user and item factors and the use_bias flag through DataIO, when use_bias is true
    the item, user and global biases are saved as well

    :param folder_path: folder handed to DataIO
    :param file_name: name of the saved file, self.RECOMMENDER_NAME is used when None
    :return:
    """

    target_file_name = self.RECOMMENDER_NAME if file_name is None else file_name

    self._print("Saving model in file '{}'".format(folder_path + target_file_name))

    model_content = dict(USER_factors=self.USER_factors,
                         ITEM_factors=self.ITEM_factors,
                         use_bias=self.use_bias)

    if self.use_bias:
        model_content.update({"ITEM_bias": self.ITEM_bias,
                              "USER_bias": self.USER_bias,
                              "GLOBAL_bias": self.GLOBAL_bias})

    DataIO(folder_path=folder_path).save_data(data_dict_to_save=model_content, file_name=target_file_name)

    self._print("Saving complete")
def _load_from_saved_sparse_matrix(self, save_folder_path):
    """
    Reads the saved global mappers and URMs through DataIO and, when the corresponding name
    lists are not empty, the ICMs and UCMs together with their mappers

    :param save_folder_path: folder handed to DataIO
    :return:
    """

    reader = DataIO(folder_path=save_folder_path)

    def restore(attribute_name, file_name):
        setattr(self, attribute_name, reader.load_data(file_name=file_name))

    # Always present
    restore("_LOADED_GLOBAL_MAPPER_DICT", "dataset_global_mappers")
    restore("_LOADED_URM_DICT", "dataset_URM")

    # Item features, only when at least one ICM is loaded
    if len(self.get_loaded_ICM_names()) > 0:
        restore("_LOADED_ICM_DICT", "dataset_ICM")
        restore("_LOADED_ICM_MAPPER_DICT", "dataset_ICM_mappers")

    # User features, only when at least one UCM is loaded
    if len(self.get_loaded_UCM_names()) > 0:
        restore("_LOADED_UCM_DICT", "dataset_UCM")
        restore("_LOADED_UCM_MAPPER_DICT", "dataset_UCM_mappers")

        # NOTE(review): the warning is assumed to belong to the UCM branch, confirm against the
        # original layout of this method
        print("RecSys2019Reader: WARNING --> There is no verification in the consistency of UCMs")
def _split_data_from_original_dataset(self, save_folder_path):
    """
    Builds a k-fold split of the URM provided by the data reader and saves it.

    The interactions of every user are shuffled and divided among self.n_folds folds of nearly
    equal size, the last fold receives the remainder. When cold users are not allowed, the users
    having fewer interactions than the number of folds are removed and the user mapper of the
    split is updated accordingly.

    Sets self.SPLIT_GLOBAL_MAPPER_DICT, self.n_users, self.n_items and self.fold_split, where
    self.fold_split[fold_index] contains "URM" (csr matrix) and "items_in_fold".

    The fold file is named "URM_<n_folds>_fold_split_<allow_cold_users|only_warm_users>", which
    is the name _load_previously_built_split_and_attributes opens.

    :param save_folder_path: folder in which the files are written, it is concatenated directly
                             with the file names so it must end with the path separator
    :return:
    """

    self.dataReader_object.load_data()
    URM = self.dataReader_object.get_URM_all()

    # Managing data reader
    # The mappers are copied so that the ones held by the data reader are left untouched
    self.SPLIT_GLOBAL_MAPPER_DICT = {}
    for mapper_name, mapper_object in self.dataReader_object.get_loaded_Global_mappers().items():
        self.SPLIT_GLOBAL_MAPPER_DICT[mapper_name] = mapper_object.copy()

    URM = sps.csr_matrix(URM)

    if not self.allow_cold_users:

        user_interactions = np.ediff1d(URM.indptr)
        user_to_preserve = user_interactions >= self.n_folds
        user_to_remove = np.logical_not(user_to_preserve)

        print("DataSplitter_Warm: Removing {} of {} users because they have less interactions than the number of folds".format(
            URM.shape[0] - user_to_preserve.sum(), URM.shape[0]))

        URM = URM[user_to_preserve, :]

        # The builtin int replaces the np.int alias, which was removed from NumPy
        removed_user_indices = np.arange(0, len(user_to_remove), dtype=int)[user_to_remove]

        self.SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self.SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
            removed_user_indices)

    self.n_users, self.n_items = URM.shape

    URM = sps.csr_matrix(URM)

    # One (row, col, data) accumulator per fold
    # List.extend is waaaay faster than numpy.concatenate, plain lists are used so that nothing
    # depends on being able to attach lists to the attributes of a sparse matrix
    fold_rows = [[] for _ in range(self.n_folds)]
    fold_cols = [[] for _ in range(self.n_folds)]
    fold_data = [[] for _ in range(self.n_folds)]

    for user_id in range(self.n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        indices_to_shuffle = np.arange(len(user_profile), dtype=int)
        np.random.shuffle(indices_to_shuffle)

        user_profile = user_profile[indices_to_shuffle]
        user_interactions = URM.data[start_user_position:end_user_position][indices_to_shuffle]

        # interactions_per_fold is a float number, to auto-adjust fold size
        interactions_per_fold = len(user_profile) / self.n_folds

        for fold_index in range(self.n_folds):

            start_pos = int(interactions_per_fold * fold_index)
            end_pos = int(interactions_per_fold * (fold_index + 1))

            # The last fold takes everything that is left
            if fold_index == self.n_folds - 1:
                end_pos = len(user_profile)

            current_fold_user_profile = user_profile[start_pos:end_pos]
            current_fold_user_interactions = user_interactions[start_pos:end_pos]

            fold_rows[fold_index].extend([user_id] * len(current_fold_user_profile))
            fold_cols[fold_index].extend(current_fold_user_profile)
            fold_data[fold_index].extend(current_fold_user_interactions)

    self.fold_split = {}

    for fold_index in range(self.n_folds):

        row_array = np.array(fold_rows[fold_index], dtype=int)
        col_array = np.array(fold_cols[fold_index], dtype=int)
        data_array = np.array(fold_data[fold_index], dtype=float)

        self.fold_split[fold_index] = {
            "URM": sps.csr_matrix((data_array, (row_array, col_array)), shape=URM.shape),
            "items_in_fold": np.arange(0, self.n_items, dtype=int),
        }

    fold_dict_to_save = {"fold_split": self.fold_split,
                         "n_folds": self.n_folds,
                         "n_items": self.n_items,
                         "n_users": self.n_users,
                         "allow_cold_users": self.allow_cold_users,
                         }

    # The same suffix is used for the fold file and for the mappers file
    if self.allow_cold_users:
        allow_cold_users_suffix = "allow_cold_users"
    else:
        allow_cold_users_suffix = "only_warm_users"

    split_file_name = "URM_{}_fold_split_{}".format(self.n_folds, allow_cold_users_suffix)

    with open(save_folder_path + split_file_name, "wb") as split_file:
        pickle.dump(fold_dict_to_save, split_file, protocol=pickle.HIGHEST_PROTOCOL)

    for ICM_name in self.dataReader_object.get_loaded_ICM_names():

        ICM_object = self.dataReader_object.get_ICM_from_name(ICM_name)

        with open(save_folder_path + "{}".format(ICM_name), "wb") as ICM_file:
            pickle.dump(ICM_object, ICM_file, protocol=pickle.HIGHEST_PROTOCOL)

        mapper_object = self.dataReader_object.get_ICM_feature_to_index_mapper_from_name(ICM_name)

        with open(save_folder_path + "tokenToFeatureMapper_{}".format(ICM_name), "wb") as mapper_file:
            pickle.dump(mapper_object, mapper_file, protocol=pickle.HIGHEST_PROTOCOL)

    # MAPPER MANAGING
    name_suffix = "_{}".format(allow_cold_users_suffix)

    dataIO = DataIO(folder_path=save_folder_path)

    dataIO.save_data(data_dict_to_save=self.SPLIT_GLOBAL_MAPPER_DICT,
                     file_name="split_mappers" + name_suffix)

    print("DataSplitter: Split complete")