def save_model(self, folder_path, file_name=None):
    """
    Store the ``item_pop`` attribute of this recommender through DataIO.

    :param folder_path: folder handed to DataIO as the destination
    :param file_name: name of the saved file; ``self.RECOMMENDER_NAME`` is used when None
    :return: None
    """
    target_name = self.RECOMMENDER_NAME if file_name is None else file_name

    self._print("Saving model in file '{}'".format(folder_path + target_name))

    payload = {"item_pop": self.item_pop}
    DataIO(folder_path=folder_path).save_data(data_dict_to_save=payload, file_name=target_name)

    self._print("Saving complete")
def _save_split(self, save_folder_path):
    """
    Write the split parameters, mappers and URM (plus ICM data when any ICM
    is loaded) into save_folder_path. File names carry a suffix that encodes
    the cold-user and validation-set settings.

    :param save_folder_path: destination folder; when falsy nothing is saved
    :return: None
    """
    # No destination folder means there is nothing to do
    if not save_folder_path:
        return

    cold_users_tag = "allow_cold_users" if self.allow_cold_users else "only_warm_users"
    validation_tag = "use_validation_set" if self.use_validation_set else "no_validation_set"
    name_suffix = "_{}_{}".format(cold_users_tag, validation_tag)

    parameters = {
        "k_out_value": self.k_out_value,
        "allow_cold_users": self.allow_cold_users,
    }

    writer = DataIO(folder_path=save_folder_path)

    writer.save_data(file_name="split_parameters" + name_suffix,
                     data_dict_to_save=parameters)

    writer.save_data(file_name="split_mappers" + name_suffix,
                     data_dict_to_save=self.SPLIT_GLOBAL_MAPPER_DICT)

    writer.save_data(file_name="split_URM" + name_suffix,
                     data_dict_to_save=self.SPLIT_URM_DICT)

    # ICM files are only produced when the data reader has at least one ICM
    has_icm = len(self.dataReader_object.get_loaded_ICM_names()) > 0
    if has_icm:
        writer.save_data(file_name="split_ICM" + name_suffix,
                         data_dict_to_save=self.SPLIT_ICM_DICT)

        writer.save_data(file_name="split_ICM_mappers" + name_suffix,
                         data_dict_to_save=self.SPLIT_ICM_MAPPER_DICT)
def save_model(self, folder_path, file_name = None):
    """
    Store the user/item factors of this recommender through DataIO.
    The bias terms are included only when ``self.use_bias`` is truthy.

    :param folder_path: folder handed to DataIO as the destination
    :param file_name: name of the saved file; ``self.RECOMMENDER_NAME`` is used when None
    :return: None
    """
    target_name = self.RECOMMENDER_NAME if file_name is None else file_name

    self._print("Saving model in file '{}'".format(folder_path + target_name))

    payload = {
        "USER_factors": self.USER_factors,
        "ITEM_factors": self.ITEM_factors,
        "use_bias": self.use_bias,
    }

    if self.use_bias:
        payload.update(
            ITEM_bias=self.ITEM_bias,
            USER_bias=self.USER_bias,
            GLOBAL_bias=self.GLOBAL_bias,
        )

    DataIO(folder_path=folder_path).save_data(data_dict_to_save=payload, file_name=target_name)

    self._print("Saving complete")
def _save_dataset(self, save_folder_path):
    """
    Save the loaded global mappers and URM, plus the ICM data and ICM mappers
    when at least one ICM is loaded.

    :param save_folder_path: folder handed to DataIO as the destination
    :return: None
    """
    writer = DataIO(folder_path=save_folder_path)

    writer.save_data(file_name="dataset_global_mappers",
                     data_dict_to_save=self._LOADED_GLOBAL_MAPPER_DICT)

    writer.save_data(file_name="dataset_URM",
                     data_dict_to_save=self._LOADED_URM_DICT)

    # ICM files are optional: skip them when no ICM has been loaded
    icm_available = len(self.get_loaded_ICM_names()) > 0
    if icm_available:
        writer.save_data(file_name="dataset_ICM",
                         data_dict_to_save=self._LOADED_ICM_DICT)

        writer.save_data(file_name="dataset_ICM_mappers",
                         data_dict_to_save=self._LOADED_ICM_MAPPER_DICT)
class CrossSearchFixedObject(object):
    """
    Base class for a hyperparameter search that fits one recommender per
    split (taken from recommender_object_list) and evaluates each of them
    with the evaluator at the same position in evaluator_validation_list.
    The per-split results are aggregated with compute_mean_std_result_dict.

    Subclasses must implement search(); this class provides the objective
    function, the metadata bookkeeping and the logging.
    """

    ALGORITHM_NAME = "CrossSearchFixedObject"

    # Value to be assigned to invalid configuration or if an Exception is raised
    INVALID_CONFIG_VALUE = np.finfo(np.float16).max

    def __init__(self, recommender_object_list, evaluator_validation_list: List[Evaluator] = None, verbose=True):
        """
        :param recommender_object_list: recommender objects, indexed by split; each is copied before being fitted
        :param evaluator_validation_list: evaluators, indexed by split
        :param verbose: when True, messages are also printed to stdout
        """
        super(CrossSearchFixedObject, self).__init__()

        self.recommender_object_list = recommender_object_list
        self.verbose = verbose
        self.log_file = None

        self.results_test_best = {}
        self.parameter_dictionary_best = {}

        self.evaluator_validation_list: List[Evaluator] = evaluator_validation_list

    def search(self, recommender_input_args_list,
               parameter_search_space,
               metric_to_optimize="MAP",
               n_cases=None,
               output_folder_path=None,
               output_file_name_root=None,
               parallelize=False,
               save_metadata=True,
               ):
        """Entry point of the search; must be provided by subclasses."""
        raise NotImplementedError("Function search not implemented for this class")

    def _set_search_attributes(self, recommender_input_args,
                               metric_to_optimize,
                               output_folder_path,
                               output_file_name_root,
                               resume_from_saved,
                               save_metadata,
                               n_cases):
        """
        Store the search settings on the instance, create the output folder
        if missing, open the log file in append mode and reset the metadata.

        :param recommender_input_args: stored as self.recommender_input_args_list and indexed by split
        :param metric_to_optimize: key read from the aggregated result dictionary
        :param output_folder_path: folder for the log and metadata; concatenated directly with the file name
        :param output_file_name_root: prefix of every file written
        :param resume_from_saved: stored as-is on the instance
        :param save_metadata: when truthy, a DataIO object is created to save the metadata
        :param n_cases: number of configurations, used to size the metadata lists
        """
        self.output_folder_path = output_folder_path
        self.output_file_name_root = output_file_name_root

        # If directory does not exist, create
        if not os.path.exists(self.output_folder_path):
            os.makedirs(self.output_folder_path)

        # The log file stays open for the whole search; _write_log flushes after every message
        self.log_file = open(
            self.output_folder_path + self.output_file_name_root + "_{}.txt".format(self.ALGORITHM_NAME), "a")

        self.recommender_input_args_list = recommender_input_args
        self.metric_to_optimize = metric_to_optimize
        self.resume_from_saved = resume_from_saved
        self.save_metadata = save_metadata

        self.model_counter = 0
        self._init_metadata_dict(n_cases=n_cases)

        if self.save_metadata:
            self.dataIO = DataIO(folder_path=self.output_folder_path)

    def _init_metadata_dict(self, n_cases):
        """Create self.metadata_dict with one empty slot per configuration in every *_list entry."""
        self.metadata_dict = {"algorithm_name_search": self.ALGORITHM_NAME,
                              "algorithm_name_recommender": self.recommender_object_list[0].RECOMMENDER_NAME,
                              "exception_list": [None] * n_cases,

                              "hyperparameters_list": [None] * n_cases,
                              "hyperparameters_best": None,
                              "hyperparameters_best_index": None,

                              "result_on_validation_list": [None] * n_cases,
                              "result_on_validation_best": None,
                              "std_result_on_validation_list": [None] * n_cases,
                              "std_result_on_validation_best": None,

                              "time_on_train_list": [None] * n_cases,
                              "time_on_train_total": 0.0,
                              "time_on_train_avg": 0.0,

                              "time_on_validation_list": [None] * n_cases,
                              "time_on_validation_total": 0.0,
                              "time_on_validation_avg": 0.0,

                              "time_on_test_list": [None] * n_cases,
                              "time_on_test_total": 0.0,
                              "time_on_test_avg": 0.0,

                              "result_on_last": None,
                              "time_on_last_train": None,
                              "time_on_last_test": None,
                              }

    def _print(self, string):
        """Print string only when verbose is enabled."""
        if self.verbose:
            print(string)

    def _write_log(self, string):
        """Print string (if verbose) and append it to the log file when one is open."""
        self._print(string)

        if self.log_file is not None:
            self.log_file.write(string)
            self.log_file.flush()

    def _warn_if_invalid_result(self, current_result):
        """Log a warning when current_result is not better than INVALID_CONFIG_VALUE."""
        if current_result >= self.INVALID_CONFIG_VALUE:
            self._write_log(
                "{}: WARNING! Config {} returned a value equal or worse than the default value to be assigned to invalid configurations."
                " If no better valid configuration is found, this parameter search may produce an invalid result.\n"
                .format(self.ALGORITHM_NAME, self.model_counter))

    def _fit_model(self, split_index, current_fit_parameters, recommender_input_args):
        """
        Copy the recommender of the given split and fit it.

        :param split_index: position in self.recommender_object_list
        :param current_fit_parameters: hyperparameters passed to fit as keyword arguments
        :param recommender_input_args: object providing FIT_POSITIONAL_ARGS and FIT_KEYWORD_ARGS
        :return: (fitted recommender instance, elapsed seconds including the copy)
        """
        start_time = time.time()

        # Construct a new recommender instance
        recommender_instance = self.recommender_object_list[split_index].copy()

        recommender_instance.fit(*recommender_input_args.FIT_POSITIONAL_ARGS,
                                 **recommender_input_args.FIT_KEYWORD_ARGS,
                                 **current_fit_parameters)

        train_time = time.time() - start_time

        return recommender_instance, train_time

    def _evaluate_on_validation(self, current_fit_parameters):
        """
        Fit and evaluate one recommender per validation evaluator, then
        aggregate the per-split results.

        :param current_fit_parameters: hyperparameters of the configuration under test
        :return: (mean result dict, std result dict, result string,
                  mean train time, mean evaluation time)
        """
        result_dicts = []
        n_splits = len(self.evaluator_validation_list)
        evaluation_times = np.zeros(n_splits)
        train_times = np.zeros(n_splits)

        self._print("{}: Testing config: {}".format(self.ALGORITHM_NAME, current_fit_parameters))

        for i, evaluator in enumerate(self.evaluator_validation_list):
            recommender_instance, train_times[i] = self._fit_model(i, current_fit_parameters,
                                                                   self.recommender_input_args_list[i])
            start_time = time.time()

            # Evaluate recommender and get results for the first cutoff
            mean_result_dict, _ = evaluator.evaluateRecommender(recommender_instance)
            mean_result_dict = mean_result_dict[list(mean_result_dict.keys())[0]]

            result_dicts.append(mean_result_dict)
            evaluation_times[i] = time.time() - start_time

        mean_result_dict, std_result_dict = compute_mean_std_result_dict(result_dicts)
        evaluation_time = evaluation_times.sum() / evaluation_times.size
        train_time = train_times.sum() / train_times.size

        result_string = get_result_string(mean_result_dict, std_result_dict, n_mean_decimals=6, n_std_decimals=4)

        return mean_result_dict, std_result_dict, result_string, train_time, evaluation_time

    def _objective_function(self, current_fit_parameters_dict):
        """
        Evaluate one configuration and record its outcome in the metadata.

        The metric is negated because the returned value is meant to be
        minimized. Any exception other than KeyboardInterrupt/SystemExit is
        logged, stored in the metadata and turned into INVALID_CONFIG_VALUE.

        :param current_fit_parameters_dict: hyperparameters of the configuration under test
        :return: negated metric value, or INVALID_CONFIG_VALUE on failure
        """
        try:

            self.metadata_dict["hyperparameters_list"][self.model_counter] = current_fit_parameters_dict.copy()

            mean_result_dict, std_result_dict, result_string, train_time, evaluation_time = self._evaluate_on_validation(
                current_fit_parameters_dict)

            current_result = - mean_result_dict[self.metric_to_optimize]

            if self.metadata_dict["result_on_validation_best"] is None:
                new_best_config_found = True
            else:
                best_solution_val = self.metadata_dict["result_on_validation_best"][self.metric_to_optimize]
                new_best_config_found = best_solution_val < mean_result_dict[self.metric_to_optimize]

            if new_best_config_found:
                self._write_log("{}: New best config found. Config {}: {} - results: {}\n".format(self.ALGORITHM_NAME,
                                                                                                  self.model_counter,
                                                                                                  current_fit_parameters_dict,
                                                                                                  result_string))

            else:
                self._write_log("{}: Config {} is suboptimal. Config: {} - results: {}\n".format(self.ALGORITHM_NAME,
                                                                                                 self.model_counter,
                                                                                                 current_fit_parameters_dict,
                                                                                                 result_string))

            self._warn_if_invalid_result(current_result)

            self.metadata_dict["result_on_validation_list"][self.model_counter] = mean_result_dict.copy()
            self.metadata_dict["std_result_on_validation_list"][self.model_counter] = std_result_dict.copy()

            self.metadata_dict["time_on_train_list"][self.model_counter] = train_time
            self.metadata_dict["time_on_validation_list"][self.model_counter] = evaluation_time

            self.metadata_dict["time_on_train_total"], self.metadata_dict["time_on_train_avg"] = \
                _compute_avg_time_non_none_values(self.metadata_dict["time_on_train_list"])
            self.metadata_dict["time_on_validation_total"], self.metadata_dict["time_on_validation_avg"] = \
                _compute_avg_time_non_none_values(self.metadata_dict["time_on_validation_list"])

            if new_best_config_found:
                self.metadata_dict["hyperparameters_best"] = current_fit_parameters_dict.copy()
                self.metadata_dict["hyperparameters_best_index"] = self.model_counter
                self.metadata_dict["result_on_validation_best"] = mean_result_dict.copy()
                self.metadata_dict["std_result_on_validation_best"] = std_result_dict.copy()

        except (KeyboardInterrupt, SystemExit) as e:
            # If getting a interrupt, terminate without saving the exception
            raise e

        except:
            # Catch any error: Exception, Tensorflow errors etc...
            # Deliberately broad: a failing configuration must not stop the search

            traceback_string = traceback.format_exc()

            self._write_log("{}: Config {} Exception. Config: {} - Exception: {}\n".format(self.ALGORITHM_NAME,
                                                                                           self.model_counter,
                                                                                           current_fit_parameters_dict,
                                                                                           traceback_string))

            self.metadata_dict["exception_list"][self.model_counter] = traceback_string

            # Assign to this configuration the worst possible score
            # Being a minimization problem, set it to the max value of a float
            current_result = + self.INVALID_CONFIG_VALUE

            traceback.print_exc()

        if self.save_metadata:
            self.dataIO.save_data(data_dict_to_save=self.metadata_dict.copy(),
                                  file_name=self.output_file_name_root + "_metadata")

        self.model_counter += 1

        return current_result
class SearchAbstractClass(object):
    """
    Base class for a hyperparameter search over a single recommender class.

    Subclasses must implement search(); this class provides the objective
    function (fit, evaluate on validation, optionally evaluate on test),
    model saving, metadata bookkeeping and logging.
    """

    ALGORITHM_NAME = "SearchAbstractClass"

    # Available values for the save_model attribute
    _SAVE_MODEL_VALUES = ["all", "best", "last", "no"]

    # Value to be assigned to invalid configuration or if an Exception is raised
    INVALID_CONFIG_VALUE = np.finfo(np.float16).max

    def __init__(self,
                 recommender_class,
                 evaluator_validation=None,
                 evaluator_test=None,
                 verbose=True):
        """
        :param recommender_class: class instantiated for every configuration
        :param evaluator_validation: evaluator used to score each configuration
        :param evaluator_test: optional evaluator used on new best configurations
        :param verbose: when True, messages are also printed to stdout
        """
        super(SearchAbstractClass, self).__init__()

        self.recommender_class = recommender_class
        self.verbose = verbose
        self.log_file = None

        self.results_test_best = {}
        self.parameter_dictionary_best = {}

        self.evaluator_validation = evaluator_validation
        self.evaluator_test = evaluator_test

    def search(
            self,
            recommender_input_args,
            parameter_search_space,
            metric_to_optimize="MAP",
            n_cases=None,
            output_folder_path=None,
            output_file_name_root=None,
            parallelize=False,
            save_model="best",
            evaluate_on_test_each_best_solution=True,
            save_metadata=True,
    ):
        """Entry point of the search; must be provided by subclasses."""
        raise NotImplementedError(
            "Function search not implemented for this class")

    def _set_search_attributes(self, recommender_input_args,
                               recommender_input_args_last_test,
                               metric_to_optimize, output_folder_path,
                               output_file_name_root, resume_from_saved,
                               save_metadata, save_model,
                               evaluate_on_test_each_best_solution, n_cases):
        """
        Validate save_model, store the search settings on the instance,
        create the output folder if missing, open the log file in append
        mode and reset the metadata.

        :param recommender_input_args: object providing constructor and fit arguments
        :param recommender_input_args_last_test: same kind of object for the final fit, or None
        :param metric_to_optimize: key read from the validation result dictionary
        :param output_folder_path: folder for logs, metadata and models; concatenated directly with the file name
        :param output_file_name_root: prefix of every file written
        :param resume_from_saved: stored as-is on the instance
        :param save_metadata: when truthy, a DataIO object is created to save the metadata
        :param save_model: one of _SAVE_MODEL_VALUES; "last" falls back to "best"
            when recommender_input_args_last_test is None
        :param evaluate_on_test_each_best_solution: whether new best configurations are evaluated on test
        :param n_cases: number of configurations, used to size the metadata lists
        :raises ValueError: if save_model is not in _SAVE_MODEL_VALUES
        """
        if save_model not in self._SAVE_MODEL_VALUES:
            raise ValueError(
                "{}: parameter save_model must be in '{}', provided was '{}'.".
                format(self.ALGORITHM_NAME, self._SAVE_MODEL_VALUES,
                       save_model))

        self.output_folder_path = output_folder_path
        self.output_file_name_root = output_file_name_root

        # If directory does not exist, create
        if not os.path.exists(self.output_folder_path):
            os.makedirs(self.output_folder_path)

        # The log file stays open for the whole search; _write_log flushes after every message
        self.log_file = open(
            self.output_folder_path + self.output_file_name_root +
            "_{}.txt".format(self.ALGORITHM_NAME), "a")

        if save_model == "last" and recommender_input_args_last_test is None:
            self._write_log(
                "{}: parameter save_model is 'last' but no recommender_input_args_last_test provided, saving best model on train data alone."
                .format(self.ALGORITHM_NAME))
            save_model = "best"

        self.recommender_input_args = recommender_input_args
        self.recommender_input_args_last_test = recommender_input_args_last_test
        self.metric_to_optimize = metric_to_optimize
        self.save_model = save_model
        self.resume_from_saved = resume_from_saved
        self.save_metadata = save_metadata
        self.evaluate_on_test_each_best_solution = evaluate_on_test_each_best_solution

        self.model_counter = 0
        self._init_metadata_dict(n_cases=n_cases)

        if self.save_metadata:
            self.dataIO = DataIO(folder_path=self.output_folder_path)

    def _init_metadata_dict(self, n_cases):
        """Create self.metadata_dict with one empty slot per configuration in every *_list entry."""
        self.metadata_dict = {
            "algorithm_name_search": self.ALGORITHM_NAME,
            "algorithm_name_recommender":
            self.recommender_class.RECOMMENDER_NAME,
            "exception_list": [None] * n_cases,
            "hyperparameters_list": [None] * n_cases,
            "hyperparameters_best": None,
            "hyperparameters_best_index": None,
            "result_on_validation_list": [None] * n_cases,
            "result_on_validation_best": None,
            "result_on_test_list": [None] * n_cases,
            "result_on_test_best": None,
            "time_on_train_list": [None] * n_cases,
            "time_on_train_total": 0.0,
            "time_on_train_avg": 0.0,
            "time_on_validation_list": [None] * n_cases,
            "time_on_validation_total": 0.0,
            "time_on_validation_avg": 0.0,
            "time_on_test_list": [None] * n_cases,
            "time_on_test_total": 0.0,
            "time_on_test_avg": 0.0,
            "result_on_last": None,
            "time_on_last_train": None,
            "time_on_last_test": None,
        }

    def _print(self, string):
        """Print string only when verbose is enabled."""
        if self.verbose:
            print(string)

    def _write_log(self, string):
        """Print string (if verbose) and append it to the log file when one is open."""
        self._print(string)

        if self.log_file is not None:
            self.log_file.write(string)
            self.log_file.flush()

    def _warn_if_invalid_result(self, current_result):
        """Log a warning when current_result is not better than INVALID_CONFIG_VALUE."""
        if current_result >= self.INVALID_CONFIG_VALUE:
            self._write_log(
                "{}: WARNING! Config {} returned a value equal or worse than the default value to be assigned to invalid configurations."
                " If no better valid configuration is found, this parameter search may produce an invalid result.\n"
                .format(self.ALGORITHM_NAME, self.model_counter))

    def _fit_model(self, current_fit_parameters):
        """
        Build a new recommender from self.recommender_input_args and fit it.

        :param current_fit_parameters: hyperparameters passed to fit as keyword arguments
        :return: (fitted recommender instance, elapsed seconds including construction)
        """
        start_time = time.time()

        # Construct a new recommender instance
        recommender_instance = self.recommender_class(
            *self.recommender_input_args.CONSTRUCTOR_POSITIONAL_ARGS,
            **self.recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS)

        self._print("{}: Testing config: {}".format(self.ALGORITHM_NAME,
                                                    current_fit_parameters))

        recommender_instance.fit(
            *self.recommender_input_args.FIT_POSITIONAL_ARGS,
            **self.recommender_input_args.FIT_KEYWORD_ARGS,
            **current_fit_parameters)

        train_time = time.time() - start_time

        return recommender_instance, train_time

    def _evaluate_on_validation(self, current_fit_parameters):
        """
        Fit a recommender with the given hyperparameters and evaluate it with
        evaluator_validation.

        :param current_fit_parameters: hyperparameters of the configuration under test
        :return: (result dict of the first cutoff, result string,
                  recommender instance, train time, evaluation time)
        """
        recommender_instance, train_time = self._fit_model(
            current_fit_parameters)

        start_time = time.time()

        # Evaluate recommender and get results for the first cutoff
        result_dict, _ = self.evaluator_validation.evaluateRecommender(
            recommender_instance)
        result_dict = result_dict[list(result_dict.keys())[0]]

        evaluation_time = time.time() - start_time

        result_string = get_result_string_evaluate_on_validation(result_dict,
                                                                 n_decimals=7)

        return result_dict, result_string, recommender_instance, train_time, evaluation_time

    def _evaluate_on_test(self,
                          recommender_instance,
                          current_fit_parameters_dict,
                          print_log=True):
        """
        Evaluate an already fitted recommender with evaluator_test.

        :param recommender_instance: fitted recommender
        :param current_fit_parameters_dict: hyperparameters, used only in the log message
        :param print_log: when True, the result is written to the log
        :return: (result dict as returned by the evaluator, result string, evaluation time)
        """
        start_time = time.time()

        # Evaluate recommender; the full result (all cutoffs) is returned
        result_dict, result_string = self.evaluator_test.evaluateRecommender(
            recommender_instance)

        evaluation_test_time = time.time() - start_time

        if print_log:
            self._write_log(
                "{}: Best config evaluated with evaluator_test. Config: {} - results:\n{}\n"
                .format(self.ALGORITHM_NAME, current_fit_parameters_dict,
                        result_string))

        return result_dict, result_string, evaluation_test_time

    def _evaluate_on_test_with_data_last(self):
        """
        Fit a recommender built from recommender_input_args_last_test with
        the best hyperparameters found, evaluate it on test, store the
        outcome in the metadata and optionally save the model.

        Returns early, without fitting, when resuming and the result on
        last is already present in the metadata.
        """
        start_time = time.time()

        # Construct a new recommender instance
        recommender_instance = self.recommender_class(
            *self.recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS,
            **self.recommender_input_args_last_test.CONSTRUCTOR_KEYWORD_ARGS)

        # Check if last was already evaluated
        if self.resume_from_saved:
            result_on_last_saved_flag = self.metadata_dict["result_on_last"] is not None and \
                                        self.metadata_dict["time_on_last_train"] is not None and \
                                        self.metadata_dict["time_on_last_test"] is not None

            if result_on_last_saved_flag:
                self._print(
                    "{}: Resuming '{}'... Result on last already available.".
                    format(self.ALGORITHM_NAME, self.output_file_name_root))
                return

        self._print(
            "{}: Evaluation with constructor data for final test. Using best config: {}"
            .format(self.ALGORITHM_NAME,
                    self.metadata_dict["hyperparameters_best"]))

        # Use the hyperparameters that have been saved
        assert self.metadata_dict[
            "hyperparameters_best"] is not None, "{}: Best hyperparameters not available, the search might have failed.".format(
                self.ALGORITHM_NAME)

        # Explicit fit keyword arguments take precedence over the searched hyperparameters
        fit_keyword_args = self.metadata_dict["hyperparameters_best"].copy()
        fit_keyword_args = {
            **fit_keyword_args,
            **self.recommender_input_args_last_test.FIT_KEYWORD_ARGS
        }

        recommender_instance.fit(
            *self.recommender_input_args_last_test.FIT_POSITIONAL_ARGS,
            **fit_keyword_args)

        train_time = time.time() - start_time

        result_dict_test, result_string, evaluation_test_time = self._evaluate_on_test(
            recommender_instance, fit_keyword_args, print_log=False)

        self._write_log(
            "{}: Best config evaluated with evaluator_test with constructor data for final test. Config: {} - results:\n{}\n"
            .format(self.ALGORITHM_NAME,
                    self.metadata_dict["hyperparameters_best"], result_string))

        self.metadata_dict["result_on_last"] = result_dict_test
        self.metadata_dict["time_on_last_train"] = train_time
        self.metadata_dict["time_on_last_test"] = evaluation_test_time

        if self.save_metadata:
            self.dataIO.save_data(data_dict_to_save=self.metadata_dict.copy(),
                                  file_name=self.output_file_name_root +
                                  "_metadata")

        if self.save_model in ["all", "best", "last"]:
            self._print("{}: Saving model in {}\n".format(
                self.ALGORITHM_NAME,
                self.output_folder_path + self.output_file_name_root))
            recommender_instance.save_model(
                self.output_folder_path,
                file_name=self.output_file_name_root + "_best_model_last")

    def _objective_function(self, current_fit_parameters_dict):
        """
        Evaluate one configuration and record its outcome in the metadata.

        The metric is negated because the returned value is meant to be
        minimized. Any exception other than KeyboardInterrupt/SystemExit is
        logged, stored in the metadata and turned into INVALID_CONFIG_VALUE.

        :param current_fit_parameters_dict: hyperparameters of the configuration under test
        :return: negated metric value, or INVALID_CONFIG_VALUE on failure
        """
        try:

            self.metadata_dict["hyperparameters_list"][
                self.model_counter] = current_fit_parameters_dict.copy()

            result_dict, result_string, recommender_instance, train_time, evaluation_time = self._evaluate_on_validation(
                current_fit_parameters_dict)

            current_result = -result_dict[self.metric_to_optimize]

            # If the recommender uses Earlystopping, get the selected number of epochs
            if isinstance(recommender_instance,
                          Incremental_Training_Early_Stopping):

                n_epochs_early_stopping_dict = recommender_instance.get_early_stopping_final_epochs_dict()
                # Work on a copy so the caller's dictionary is not modified
                current_fit_parameters_dict = current_fit_parameters_dict.copy()

                for epoch_label in n_epochs_early_stopping_dict.keys():
                    epoch_value = n_epochs_early_stopping_dict[epoch_label]
                    current_fit_parameters_dict[epoch_label] = epoch_value

            # With save_model "all" every model is saved under its own counter;
            # the best model is always saved separately below
            if self.save_model in ["all"]:
                self._print("{}: Saving model in {}\n".format(
                    self.ALGORITHM_NAME,
                    self.output_folder_path + self.output_file_name_root))
                recommender_instance.save_model(
                    self.output_folder_path,
                    file_name=self.output_file_name_root +
                    "_model_{}".format(self.model_counter))

            if self.metadata_dict["result_on_validation_best"] is None:
                new_best_config_found = True
            else:
                best_solution_val = self.metadata_dict[
                    "result_on_validation_best"][self.metric_to_optimize]
                new_best_config_found = best_solution_val < result_dict[
                    self.metric_to_optimize]

            if new_best_config_found:

                self._write_log(
                    "{}: New best config found. Config {}: {} - results: {}\n".
                    format(self.ALGORITHM_NAME, self.model_counter,
                           current_fit_parameters_dict, result_string))

                if self.save_model in ["all", "best"]:
                    self._print("{}: Saving model in {}\n".format(
                        self.ALGORITHM_NAME,
                        self.output_folder_path + self.output_file_name_root))
                    recommender_instance.save_model(
                        self.output_folder_path,
                        file_name=self.output_file_name_root + "_best_model")

                if self.evaluator_test is not None and self.evaluate_on_test_each_best_solution:
                    result_dict_test, _, evaluation_test_time = self._evaluate_on_test(
                        recommender_instance,
                        current_fit_parameters_dict,
                        print_log=True)

            else:
                self._write_log(
                    "{}: Config {} is suboptimal. Config: {} - results: {}\n".
                    format(self.ALGORITHM_NAME, self.model_counter,
                           current_fit_parameters_dict, result_string))

            self._warn_if_invalid_result(current_result)

            self.metadata_dict["result_on_validation_list"][
                self.model_counter] = result_dict.copy()

            self.metadata_dict["time_on_train_list"][
                self.model_counter] = train_time
            self.metadata_dict["time_on_validation_list"][
                self.model_counter] = evaluation_time

            self.metadata_dict["time_on_train_total"], self.metadata_dict["time_on_train_avg"] = \
                _compute_avg_time_non_none_values(self.metadata_dict["time_on_train_list"])
            self.metadata_dict["time_on_validation_total"], self.metadata_dict["time_on_validation_avg"] = \
                _compute_avg_time_non_none_values(self.metadata_dict["time_on_validation_list"])

            if new_best_config_found:
                self.metadata_dict[
                    "hyperparameters_best"] = current_fit_parameters_dict.copy()
                self.metadata_dict[
                    "hyperparameters_best_index"] = self.model_counter
                self.metadata_dict[
                    "result_on_validation_best"] = result_dict.copy()

                if self.evaluator_test is not None and self.evaluate_on_test_each_best_solution:
                    self.metadata_dict[
                        "result_on_test_best"] = result_dict_test.copy()
                    self.metadata_dict["result_on_test_list"][
                        self.model_counter] = result_dict_test.copy()
                    self.metadata_dict["time_on_test_list"][
                        self.model_counter] = evaluation_test_time

                    self.metadata_dict["time_on_test_total"], self.metadata_dict["time_on_test_avg"] = \
                        _compute_avg_time_non_none_values(self.metadata_dict["time_on_test_list"])

        except (KeyboardInterrupt, SystemExit) as e:
            # If getting a interrupt, terminate without saving the exception
            raise e

        except:
            # Catch any error: Exception, Tensorflow errors etc...
            # Deliberately broad: a failing configuration must not stop the search

            traceback_string = traceback.format_exc()

            self._write_log(
                "{}: Config {} Exception. Config: {} - Exception: {}\n".format(
                    self.ALGORITHM_NAME, self.model_counter,
                    current_fit_parameters_dict, traceback_string))

            self.metadata_dict["exception_list"][
                self.model_counter] = traceback_string

            # Assign to this configuration the worst possible score
            # Being a minimization problem, set it to the max value of a float
            current_result = +self.INVALID_CONFIG_VALUE

            traceback.print_exc()

        if self.save_metadata:
            self.dataIO.save_data(data_dict_to_save=self.metadata_dict.copy(),
                                  file_name=self.output_file_name_root +
                                  "_metadata")

        self.model_counter += 1

        return current_result
def _split_data_from_original_dataset(self, save_folder_path):
    """
    Load the dataset, split every user's interactions into self.n_folds
    folds and save the result in save_folder_path.

    Each user's interactions are shuffled with np.random and then cut into
    n_folds contiguous slices; the last fold takes whatever remains. When
    cold users are not allowed, users with fewer interactions than the
    number of folds are removed first and the user mapper is reconciled.

    Sets self.SPLIT_GLOBAL_MAPPER_DICT, self.n_users, self.n_items and
    self.fold_split, and writes the fold split, every loaded ICM with its
    feature mapper (pickle) and the global mappers (DataIO).

    :param save_folder_path: destination folder; concatenated directly with the file names
    :return: None
    """
    self.dataReader_object.load_data()
    URM = self.dataReader_object.get_URM_all()

    # Managing data reader
    self.SPLIT_GLOBAL_MAPPER_DICT = {}
    for mapper_name, mapper_object in self.dataReader_object.get_loaded_Global_mappers().items():
        self.SPLIT_GLOBAL_MAPPER_DICT[mapper_name] = mapper_object.copy()

    URM = sps.csr_matrix(URM)

    if not self.allow_cold_users:
        user_interactions = np.ediff1d(URM.indptr)
        user_to_preserve = user_interactions >= self.n_folds
        user_to_remove = np.logical_not(user_to_preserve)

        print(
            "DataSplitter_Warm: Removing {} of {} users because they have less interactions than the number of folds".format(
                URM.shape[0] - user_to_preserve.sum(), URM.shape[0]))

        URM = URM[user_to_preserve, :]

        # np.int was removed from NumPy; the builtin int is the type it aliased
        self.SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = reconcile_mapper_with_removed_tokens(
            self.SPLIT_GLOBAL_MAPPER_DICT["user_original_ID_to_index"],
            np.arange(0, len(user_to_remove), dtype=int)[user_to_remove])

    self.n_users, self.n_items = URM.shape

    URM = sps.csr_matrix(URM)

    # Accumulate the (row, col, data) triplets of each fold in plain lists
    # List.extend is waaaay faster than numpy.concatenate
    fold_rows = [[] for _ in range(self.n_folds)]
    fold_cols = [[] for _ in range(self.n_folds)]
    fold_data = [[] for _ in range(self.n_folds)]

    for user_id in range(self.n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        indices_to_suffle = np.arange(len(user_profile), dtype=int)

        np.random.shuffle(indices_to_suffle)

        user_profile = user_profile[indices_to_suffle]
        user_interactions = URM.data[start_user_position:end_user_position][indices_to_suffle]

        # interactions_per_fold is a float number, to auto-adjust fold size
        interactions_per_fold = len(user_profile) / self.n_folds

        for fold_index in range(self.n_folds):

            start_pos = int(interactions_per_fold * fold_index)
            end_pos = int(interactions_per_fold * (fold_index + 1))

            # The last fold takes every remaining interaction
            if fold_index == self.n_folds - 1:
                end_pos = len(user_profile)

            current_fold_user_profile = user_profile[start_pos:end_pos]
            current_fold_user_interactions = user_interactions[start_pos:end_pos]

            fold_rows[fold_index].extend([user_id] * len(current_fold_user_profile))
            fold_cols[fold_index].extend(current_fold_user_profile)
            fold_data[fold_index].extend(current_fold_user_interactions)

    # Build one sparse URM per fold from the accumulated triplets
    self.fold_split = {}

    for fold_index in range(self.n_folds):
        row = np.array(fold_rows[fold_index], dtype=int)
        col = np.array(fold_cols[fold_index], dtype=int)
        data = np.array(fold_data[fold_index], dtype=float)

        URM_fold_object = sps.coo_matrix((data, (row, col)), shape=URM.shape)

        self.fold_split[fold_index] = {}
        self.fold_split[fold_index]["URM"] = sps.csr_matrix(URM_fold_object)
        self.fold_split[fold_index]["items_in_fold"] = np.arange(0, self.n_items, dtype=int)

    fold_dict_to_save = {"fold_split": self.fold_split,
                         "n_folds": self.n_folds,
                         "n_items": self.n_items,
                         "n_users": self.n_users,
                         "allow_cold_users": self.allow_cold_users,
                         }

    # NOTE(review): both branches yield the same literal, so the file name does
    # not distinguish the two settings - confirm whether this is intended
    if self.allow_cold_users:
        allow_user = "******"
    else:
        allow_user = "******"

    # Context managers make sure every file is flushed and closed
    with open(save_folder_path + "URM_{}_fold_split_{}".format(self.n_folds, allow_user), "wb") as fold_file:
        pickle.dump(fold_dict_to_save, fold_file, protocol=pickle.HIGHEST_PROTOCOL)

    for ICM_name in self.dataReader_object.get_loaded_ICM_names():
        with open(save_folder_path + "{}".format(ICM_name), "wb") as icm_file:
            pickle.dump(self.dataReader_object.get_ICM_from_name(ICM_name),
                        icm_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        with open(save_folder_path + "tokenToFeatureMapper_{}".format(ICM_name), "wb") as mapper_file:
            pickle.dump(self.dataReader_object.get_ICM_feature_to_index_mapper_from_name(ICM_name),
                        mapper_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

    # MAPPER MANAGING
    if self.allow_cold_users:
        allow_cold_users_suffix = "allow_cold_users"
    else:
        allow_cold_users_suffix = "only_warm_users"

    name_suffix = "_{}".format(allow_cold_users_suffix)

    dataIO = DataIO(folder_path=save_folder_path)

    dataIO.save_data(data_dict_to_save=self.SPLIT_GLOBAL_MAPPER_DICT,
                     file_name="split_mappers" + name_suffix)

    print("DataSplitter: Split complete")
def _save_dataset(self, save_folder_path):
    """
    Saves the global mappers and the URM, then the ICM and UCM data
    (with their mappers) when any ICM or UCM is loaded

    :param save_folder_path: folder handed to DataIO as the destination
    :return: None
    """
    writer = DataIO(folder_path=save_folder_path)

    writer.save_data(file_name="dataset_global_mappers",
                     data_dict_to_save=self._LOADED_GLOBAL_MAPPER_DICT)

    writer.save_data(file_name="dataset_URM",
                     data_dict_to_save=self._LOADED_URM_DICT)

    # ICM files are optional: skip them when no ICM has been loaded
    icm_available = len(self.get_loaded_ICM_names()) > 0
    if icm_available:
        writer.save_data(file_name="dataset_ICM",
                         data_dict_to_save=self._LOADED_ICM_DICT)

        writer.save_data(file_name="dataset_ICM_mappers",
                         data_dict_to_save=self._LOADED_ICM_MAPPER_DICT)

    # Same for the UCM files
    ucm_available = len(self.get_loaded_UCM_names()) > 0
    if ucm_available:
        writer.save_data(file_name="dataset_UCM",
                         data_dict_to_save=self._LOADED_UCM_DICT)

        writer.save_data(file_name="dataset_UCM_mappers",
                         data_dict_to_save=self._LOADED_UCM_MAPPER_DICT)