def __evaluate_on_test(self, recommender_instance):

    # Evaluate recommender and get results for the first cutoff
    result_dict, result_string = self.evaluator_test.evaluateRecommender(recommender_instance,
                                                                         self.recommender_constructor_dict)

    result_dict = result_dict[list(result_dict.keys())[0]]

    writeLog(self.ALGORITHM_NAME + ": Best result evaluated on URM_test. Config: {} - results:\n{}\n".format(
        self.best_solution_parameters, result_string), self.log_file)

    return result_dict
for x in only_sum_equal_1:

    print("-------------------")
    print("---weights = {}---".format(x))
    print("-------------------")

    hybridRecommender_scores.fit(weights=x)

    # print("-------------------")
    # print("---Hybrid fitted---")
    # print("-------------------")

    print("-------------------")
    print("-Hybrid Evaluation-")
    print("-------------------")

    # Renamed from "dict" to avoid shadowing the built-in type
    result_dict, _ = evaluator_test.evaluateRecommender(hybridRecommender_scores)

    writeLog("---weights = {}---".format(x), output_file)
    writeLog("--- Parameters : {} ".format(result_dict), output_file)
    print(result_dict)

# target_data = pd.read_csv('data/target_playlists.csv')
#
# print("--------------------------")
# print("------Recommendation------")
# print("--------------------------")
# ws.write_submission(target_data, , 'output/submission.csv', at=10)

# print(hybrid_recommender.evaluateRecommendations(URM_test))
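# Hypothetical sketch (not part of the original script): "only_sum_equal_1" is assumed
# to be a grid of candidate weight tuples filtered so that each tuple sums to 1, which
# is what the loop above iterates over. One way to build such a grid with itertools,
# shown only as an illustration ("repeat=3" assumes three recommenders are combined;
# adjust to the actual number):
#
#     import itertools
#
#     candidate_values = [0.0, 0.25, 0.5, 0.75, 1.0]
#     only_sum_equal_1 = [w for w in itertools.product(candidate_values, repeat=3)
#                         if abs(sum(w) - 1.0) < 1e-9]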
def run_multiprocess_search(self, paramether_dictionary_list, num_cases_max):

    # The following function runs the search in parallel. As different configurations might have
    # significantly divergent runtime, threads must be joined from the first to terminate and the
    # objects might be big, therefore multiprocessing.Pool is not suitable

    num_cases_evaluated = 0
    num_cases_started = 0
    num_cases_active = 0
    termination_sent = False

    process_list = [None] * self.parallelPoolSize

    queue_job_todo = Queue()
    queue_job_done = Queue()

    get_memory_threshold_reached_partial = partial(get_memory_threshold_reached,
                                                   max_ram_occupied_perc=self.max_ram_occupied_perc)

    for current_process_index in range(self.parallelPoolSize):

        newProcess = multiprocessing.Process(target=process_worker,
                                             args=(queue_job_todo,
                                                   queue_job_done,
                                                   current_process_index,
                                                   get_memory_threshold_reached_partial,))

        process_list[current_process_index] = newProcess
        newProcess.start()
        newProcess = None

        print("Started process: {}".format(current_process_index))

    memory_threshold_reached, memory_used_quota = get_memory_threshold_reached(self.max_ram_occupied_perc)

    while num_cases_evaluated < num_cases_max:

        # Create as many new jobs as needed
        # Stop if the max number of parallel processes is reached, if the max RAM occupancy is reached,
        # or if there are no other cases to explore
        # If no termination was sent and active == 0, start one, otherwise everything stalls
        # WARNING: apparently the function "queue_job_todo.empty()" is not reliable
        while ((num_cases_active < self.parallelPoolSize and not memory_threshold_reached)
               or (num_cases_active == 0)) \
                and not termination_sent:

            memory_threshold_reached, memory_used_quota = get_memory_threshold_reached(self.max_ram_occupied_perc)

            if memory_threshold_reached:
                writeLog(self.ALGORITHM_NAME + ": Memory threshold reached, occupied {:.4f} %\n".format(
                    memory_used_quota), self.logFile)

            if num_cases_started < num_cases_max and not memory_threshold_reached:

                process_object = Process_object_data_and_evaluation(self.recommender_class,
                                                                    self.dictionary_input,
                                                                    paramether_dictionary_list[num_cases_started],
                                                                    self.ALGORITHM_NAME,
                                                                    self.URM_validation,
                                                                    self.evaluation_function)

                queue_job_todo.put(process_object)
                num_cases_started += 1
                num_cases_active += 1

                process_object = None
                gc.collect()

            if num_cases_started >= num_cases_max and not termination_sent:
                print("Termination sent")
                queue_job_todo.put(None)
                termination_sent = True
                gc.collect()

        # Read all completed jobs. WARNING: apparently the function "empty" is not reliable
        queue_job_done_is_empty = False

        while not queue_job_done_is_empty:

            try:
                process_object = queue_job_done.get_nowait()

                # Record the result and update the incumbent best configuration
                self.update_on_new_result(process_object, num_cases_evaluated)

                num_cases_evaluated += 1
                num_cases_active -= 1

                process_object = None

            except Empty:
                queue_job_done_is_empty = True

        time.sleep(1)
        gc.collect()

        # print("num_cases_evaluated {}".format(num_cases_evaluated))
        # print("Evaluated {}, started {}, active {}".format(num_cases_evaluated, num_cases_started, num_cases_active))

    # Remove the termination sentinel the last worker put back into the queue
    queue_job_todo.get()

    for current_process in process_list:
        # print("Waiting to Join {}".format(current_process))
        current_process.join()
        print("Joined {}".format(current_process))
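# Hypothetical sketch (not part of the original module) of the worker-side loop that
# "process_worker" is assumed to implement: consume jobs from queue_job_todo, run the
# evaluation, push the finished object to queue_job_done, and re-enqueue the None
# sentinel so every worker (and finally the parent, via the last queue_job_todo.get())
# sees it. "fit_and_evaluate" is a placeholder name; the memory-check callable is
# received but its use is omitted here.
#
#     def process_worker(queue_job_todo, queue_job_done, process_index, memory_check):
#
#         while True:
#             process_object = queue_job_todo.get()
#
#             if process_object is None:
#                 # Put the sentinel back for the other workers, then terminate
#                 queue_job_todo.put(None)
#                 break
#
#             process_object.fit_and_evaluate()   # hypothetical method name
#             queue_job_done.put(process_object)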
def update_on_new_result(self, process_object, num_cases_evaluated):

    paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(
        process_object.recommender,
        process_object.paramether_dictionary_to_evaluate)

    if process_object.exception is not None:
        writeLog(self.ALGORITHM_NAME + ": Exception for config {}: {} - Exception: {}\n".format(
            self.model_counter, paramether_dictionary_to_save, str(process_object.exception)),
            self.logFile)
        return

    if process_object.result_dict is None:
        writeLog(self.ALGORITHM_NAME + ": Result is None for config {}: {}\n".format(
            self.model_counter, paramether_dictionary_to_save),
            self.logFile)
        return

    self.model_counter += 1

    # Always save best model separately
    # if self.save_model == "all":
    #     # print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path))
    #     # process_object.recommender.saveModel(self.output_root_path,
    #     #                                      file_name="_model_{}".format(self.model_counter))
    #     #
    #     # pickle.dump(paramether_dictionary_to_save.copy(),
    #     #             open(self.output_root_path + "_parameters_{}".format(self.model_counter), "wb"),
    #     #             protocol=pickle.HIGHEST_PROTOCOL)
    #     a = 1  # I DON'T WANT TO SAVE ANY MODEL

    if self.best_solution_val is None or self.best_solution_val < process_object.result_dict[self.metric]:

        writeLog(self.ALGORITHM_NAME + ": New best config found. Config {}: {} - MAP results: {}\n".format(
            self.model_counter, paramether_dictionary_to_save, process_object.result_dict[self.metric]),
            self.logFile)

        pickle.dump(paramether_dictionary_to_save.copy(),
                    open(self.output_root_path + "_best_parameters", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        self.best_solution_val = process_object.result_dict[self.metric]
        self.best_solution_parameters = paramether_dictionary_to_save.copy()

        dereference_recommender_attributes(self.best_solution_object)
        self.best_solution_object = process_object.recommender

        # Always save best model separately
        # if self.save_model != "no":
        #     print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path))
        #     process_object.recommender.saveModel(self.output_root_path, file_name="_best_model")

        if self.URM_test is not None:
            self.evaluate_on_test(self.URM_test)

    else:
        writeLog(self.ALGORITHM_NAME + ": Config is suboptimal. Config {}: {} - MAP results: {}\n".format(
            self.model_counter, paramether_dictionary_to_save, process_object.result_dict[self.metric]),
            self.logFile)

        dereference_recommender_attributes(process_object.recommender)

    dump_garbage()
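# Hypothetical sketch (not part of the original module): "dereference_recommender_attributes"
# is assumed to drop references to the large objects a fitted recommender holds (URM,
# similarity matrices, factor matrices) so that the previously kept best model can be
# garbage collected. The generic attribute loop below is an assumption, not the
# framework's actual implementation.
#
#     def dereference_recommender_attributes(recommender):
#
#         if recommender is None:
#             return
#
#         for attribute_name in list(vars(recommender).keys()):
#             setattr(recommender, attribute_name, None)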
def search(self, dictionary_input, metric="map", n_cases=60, output_root_path=None, parallelPoolSize=6, parallelize=True, save_model="no", max_ram_occupied_perc=None): # Associate the params that will be returned by BayesianOpt object to those you want to save # E.g. with early stopping you know which is the optimal number of epochs only afterwards # but you might want to save it as well self.from_fit_params_to_saved_params = {} self.dictionary_input = dictionary_input.copy() self.output_root_path = output_root_path self.logFile = open( self.output_root_path + "_" + self.ALGORITHM_NAME + ".txt", "a") self.metric = metric self.model_counter = 0 if max_ram_occupied_perc is None: self.max_ram_occupied_perc = 0.7 else: # Try if current ram status is possible to read try: get_RAM_status() self.max_ram_occupied_perc = max_ram_occupied_perc except: writeLog( self.ALGORITHM_NAME + ": Unable to read RAM status, ignoring max RAM setting", self.logFile) self.max_ram_occupied_perc = None if save_model in ["no", "best", "all"]: # self.save_model = save_model a = 1 else: raise ValueError( self.ALGORITHM_NAME + ": save_model not recognized, acceptable values are: {}, given is {}" .format(["no", "best", "all"], save_model)) if parallelPoolSize is None: self.parallelPoolSize = 1 else: # self.parallelPoolSize = int(multiprocessing.cpu_count()/2) self.parallelPoolSize = parallelPoolSize self.best_solution_val = None self.best_solution_parameters = None self.best_solution_object = None paramether_dictionary_list = self.build_all_cases_to_evaluate(n_cases) # Randomize ordering of cases random.shuffle(paramether_dictionary_list) self.runSingleCase_partial = partial(self.runSingleCase, metric=metric) if parallelize: self.run_multiprocess_search(paramether_dictionary_list, n_cases) else: self.run_singleprocess_search(paramether_dictionary_list, n_cases) writeLog( self.ALGORITHM_NAME + ": Best config is: Config {}, {} value is {:.4f}\n".format( self.best_solution_parameters, metric, self.best_solution_val), self.logFile) return self.best_solution_parameters.copy()
def __objective_function(self, current_fit_parameters_values):

    current_fit_parameters = dict(zip(self.hyperparams_names, current_fit_parameters_values))

    result_dict, _, recommender_instance = self.__evaluate(current_fit_parameters, self.evaluator_validation)

    current_result = - result_dict[self.metric_to_optimize]

    paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(recommender_instance,
                                                                                  current_fit_parameters)

    self.from_fit_params_to_saved_params[frozenset(current_fit_parameters.items())] = paramether_dictionary_to_save

    # Always save best model separately
    if self.save_model == "all":
        print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path + self.output_file_name_root))
        recommender_instance.saveModel(self.output_root_path,
                                       file_name=self.output_file_name_root + "_model_{}".format(self.model_counter))

        pickle.dump(paramether_dictionary_to_save.copy(),
                    open(self.output_root_path + self.output_file_name_root + "_parameters_{}".format(self.model_counter), "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

    if self.best_solution_val is None or self.best_solution_val < result_dict[self.metric_to_optimize]:

        writeLog("BayesianSearch: New best config found. Config {}: {} - results: {}\n".format(
            self.model_counter, paramether_dictionary_to_save, result_dict), self.log_file)

        pickle.dump(paramether_dictionary_to_save.copy(),
                    open(self.output_root_path + self.output_file_name_root + "_best_parameters", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        pickle.dump(result_dict.copy(),
                    open(self.output_root_path + self.output_file_name_root + "_best_result_validation", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

        self.best_solution_val = result_dict[self.metric_to_optimize]
        self.best_solution_parameters = paramether_dictionary_to_save.copy()

        if self.save_model != "no":
            print("BayesianSearch: Saving model in {}\n".format(self.output_root_path + self.output_file_name_root))
            recommender_instance.saveModel(self.output_root_path,
                                           file_name=self.output_file_name_root + "_best_model")

        if self.evaluator_test is not None:
            result_dict_test = self.__evaluate_on_test(recommender_instance)

            pickle.dump(result_dict_test,
                        open(self.output_root_path + self.output_file_name_root + "_best_result_test", "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)

    else:
        writeLog("BayesianSearch: Config {} is suboptimal. Config: {} - results: {}\n".format(
            self.model_counter, paramether_dictionary_to_save, result_dict), self.log_file)

    self.model_counter += 1

    return current_result
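# Hypothetical sketch (not part of the original module) of how an objective like the one
# above can be plugged into a sequential model-based optimizer. This assumes scikit-optimize
# (skopt): gp_minimize passes each candidate configuration as a list of values ordered like
# self.hyperparams_names, and minimizes the returned value, which is why the objective
# negates the validation metric. The search space below ("hyperparams_search_space" and its
# dimensions) is purely illustrative, not the framework's actual setup.
#
#     from skopt import gp_minimize
#     from skopt.space import Real, Integer, Categorical
#
#     hyperparams_search_space = [Integer(10, 500, name="num_factors"),
#                                 Real(1e-5, 1e-1, prior="log-uniform", name="learning_rate"),
#                                 Categorical(["adam", "sgd"], name="optimizer")]
#
#     optimization_result = gp_minimize(self.__objective_function,
#                                       hyperparams_search_space,
#                                       n_calls=n_cases,
#                                       random_state=42)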
def search(self, dictionary_input, metric="map", n_cases=30, output_root_path=None, parallelPoolSize=2, parallelize=True, save_model="best"): # Associate the params that will be returned by BayesianOpt object to those you want to save # E.g. with early stopping you know which is the optimal number of epochs only afterwards # but you might want to save it as well self.from_fit_params_to_saved_params = {} self.dictionary_input = dictionary_input.copy() self.output_root_path = output_root_path self.logFile = open(self.output_root_path + "_BayesianSearch.txt", "a") self.metric = metric self.model_counter = 0 if save_model in ["no", "best", "all"]: self.save_model = save_model else: raise ValueError( self.ALGORITHM_NAME + ": save_model not recognized, acceptable values are: {}, given is {}" .format(["no", "best", "all"], save_model)) if not parallelize: self.parallelPoolSize = 1 else: self.parallelPoolSize = parallelPoolSize self.best_solution_val = None self.best_solution_parameters = None self.best_solution_object = None paramether_dictionary_list = self.build_all_cases_to_evaluate() self.runSingleCase_partial = partial(self.runSingleCase, metric=metric) if parallelize: self.run_multiprocess_search(paramether_dictionary_list, n_cases) else: self.run_singleprocess_search(paramether_dictionary_list, n_cases) writeLog( self.ALGORITHM_NAME + ": Best config is: Config {}, {} value is {:.4f}\n".format( self.best_solution_parameters, metric, self.best_solution_val), self.logFile) return self.best_solution_parameters.copy() # def runSingleCase(self, paramether_dictionary, dictionary, folderPath = None, namePrefix = None): # # try: # # # Create an object of the same class of the imput # # Passing the paramether as a dictionary # recommender = self.recommender_class(*dictionary[DictionaryKeys.CONSTRUCTOR_POSITIONAL_ARGS], # **dictionary[DictionaryKeys.CONSTRUCTOR_KEYWORD_ARGS]) # # # print("GridSearch: Testing config: {}".format(paramether_dictionary)) # # recommender.fit(*dictionary[DictionaryKeys.FIT_POSITIONAL_ARGS], # **dictionary[DictionaryKeys.FIT_KEYWORD_ARGS], # **paramether_dictionary) # # # paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(recommender, paramether_dictionary) # # # if folderPath != None: # recommender.saveModel(folderPath, namePrefix = namePrefix) # # # #return recommender.evaluateRecommendations(self.URM_validation, at=5, mode="sequential") # return self.evaluation_function(recommender, self.URM_validation, paramether_dictionary), paramether_dictionary_to_save # # # except Exception as e: # # print("GridSearch: Testing config: {} - Exception {}\n".format(paramether_dictionary, str(e))) # traceback.print_exc() # # return None # # # # def search(self, dictionary, metric ="map", logFile = None, parallelPoolSize = 2, parallelize = True, # folderPath = None, namePrefix = None): # # hyperparamethers_range_dictionary = dictionary[DictionaryKeys.FIT_RANGE_KEYWORD_ARGS] # # key_list = list(hyperparamethers_range_dictionary.keys()) # # # Unpack list ranges from hyperparamethers to validate onto # # * operator allows to transform a list of objects into positional arguments # test_cases = itertools.product(*hyperparamethers_range_dictionary.values()) # # paramether_dictionary_list = [] # # at_least_one_evaluation_done = False # # for current_case in test_cases: # # paramether_dictionary = {} # # for index in range(len(key_list)): # # paramether_dictionary[key_list[index]] = current_case[index] # # paramether_dictionary_list.append(paramether_dictionary) # # 
#         # results_test = self.runSingleCase(dictionary, paramether_dictionary, logFile)
#
#         if len(paramether_dictionary_list) >= parallelPoolSize or not parallelize:
#
#             at_least_one_evaluation_done = True
#             self.evaluateBlock(dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize)
#
#             # Reset paramether list for next block
#             paramether_dictionary_list = []
#
#     if not at_least_one_evaluation_done:
#         # Test cases are less than number of parallel threads
#         at_least_one_evaluation_done = True
#         self.evaluateBlock(dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize)
#
#     writeLog("GridSearch: Best config is: Config {}, {} value is {:.4f}\n".format(
#         self.paramether_dictionary_best, metric, self.results_test_best[metric]), logFile)
#
#     if folderPath != None:
#
#         writeLog("BayesianSearch: Saving model in {}\n".format(folderPath), logFile)
#         self.runSingleCase(self.paramether_dictionary_best, metric, folderPath=folderPath, namePrefix=namePrefix)
#
#     return self.paramether_dictionary_best
#
#
# def evaluateBlock(self, dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize):
#
#     if parallelize:
#
#         runSingleCase_partial = partial(self.runSingleCase,
#                                         dictionary=dictionary)
#
#         pool = multiprocessing.Pool(processes=parallelPoolSize, maxtasksperchild=1)
#         resultList = pool.map(runSingleCase_partial, paramether_dictionary_list)
#
#         pool.close()
#
#     else:
#         resultList = self.runSingleCase(paramether_dictionary_list[0], dictionary)
#         resultList = [resultList]
#
#     for results_index in range(len(resultList)):
#
#         results_test, paramether_dictionary_test = resultList[results_index]
#
#         if results_test != None:
#
#             if metric not in self.results_test_best or results_test[metric] > self.results_test_best[metric]:
#
#                 self.results_test_best = results_test.copy()
#                 self.paramether_dictionary_best = paramether_dictionary_test.copy()
#
#                 writeLog("GridSearch: New best config found. Config {}, {} value is {:.4f}\n".format(
#                     paramether_dictionary_test, metric, self.results_test_best[metric]), logFile)
#
#             else:
#                 writeLog("GridSearch: Config is suboptimal. Config {}, {} value is {:.4f}\n".format(
#                     paramether_dictionary_test, metric, results_test[metric]), logFile)
#
#
# if __name__ == '__main__':
#
#     from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython
#     from data.NetflixEnhanced.NetflixEnhancedReader import NetflixEnhancedReader
#
#     dataReader = NetflixEnhancedReader()
#     URM_train = dataReader.get_URM_train()
#     URM_test = dataReader.get_URM_test()
#
#     logFile = open("BPR_MF_GridSearch.txt", "a")
#
#     gridSearch = GridSearch(MF_BPR_Cython, None, URM_test, None)
#
#     hyperparamethers_range_dictionary = {}
#     hyperparamethers_range_dictionary["num_factors"] = list(range(1, 51, 5))
#     hyperparamethers_range_dictionary["epochs"] = list(range(1, 51, 10))
#     hyperparamethers_range_dictionary["batch_size"] = list(range(1, 101, 50))
#     hyperparamethers_range_dictionary["learning_rate"] = [1e-1, 1e-2, 1e-3, 1e-4]
#
#     recommenderDictionary = {DictionaryKeys.CONSTRUCTOR_POSITIONAL_ARGS: [URM_train],
#                              DictionaryKeys.CONSTRUCTOR_KEYWORD_ARGS: dict(),
#                              DictionaryKeys.FIT_POSITIONAL_ARGS: dict(),
#                              DictionaryKeys.FIT_KEYWORD_ARGS: dict(),
#                              DictionaryKeys.FIT_RANGE_KEYWORD_ARGS: hyperparamethers_range_dictionary}
#
#     best_paramethers = gridSearch.search(recommenderDictionary, logFile=logFile)
#
#     print(best_paramethers)