Example #1
    def __evaluate_on_test(self, recommender_instance):

        # Evaluate recommender and get results for the first cutoff
        result_dict, result_string = self.evaluator_test.evaluateRecommender(recommender_instance, self.recommender_constructor_dict)
        result_dict = result_dict[list(result_dict.keys())[0]]

        writeLog(self.ALGORITHM_NAME + ": Best result evaluated on URM_test. Config: {} - results:\n{}\n".format(self.best_solution_parameters, result_string), self.log_file)

        return result_dict
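# NOTE: evaluateRecommender is assumed here to return a dictionary keyed by cutoff plus a formatted summary
# string; the method above keeps only the first cutoff. A minimal sketch of that assumed shape, with
# placeholder values purely for illustration:

result_dict = {
    10: {"MAP": 0.05, "PRECISION": 0.10, "RECALL": 0.20},   # placeholder numbers
    20: {"MAP": 0.04, "PRECISION": 0.08, "RECALL": 0.25},
}
result_string = "CUTOFF: 10 - MAP: 0.05, PRECISION: 0.10, RECALL: 0.20"

# Keep only the results for the first cutoff, as done above
result_dict = result_dict[list(result_dict.keys())[0]]
print(result_dict)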
Example #2
for x in only_sum_equal_1:
    print("-------------------")
    print("---weights = {}---".format(x))
    print("-------------------")
    hybridRecommender_scores.fit(weights=x)
    # print("-------------------")
    # print("---Hybrid fitted---")
    # print("-------------------")

    print("-------------------")
    print("-Hybrid Evaluation-")
    print("-------------------")

    result_dict, _ = evaluator_test.evaluateRecommender(hybridRecommender_scores)

    writeLog("---weights = {}---".format(x), output_file)
    writeLog("--- Results : {} ".format(result_dict), output_file)
    print(result_dict)
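# NOTE: only_sum_equal_1 is not defined in this snippet. A plausible way to build it, assuming three
# recommenders are blended, is to enumerate a grid of weight tuples and keep those summing to 1
# (names and grid step are assumptions, not taken from the original script):

import itertools

candidate_values = [round(0.1 * i, 1) for i in range(11)]            # 0.0, 0.1, ..., 1.0
all_combinations = itertools.product(candidate_values, repeat=3)     # one weight per recommender
only_sum_equal_1 = [w for w in all_combinations if abs(sum(w) - 1.0) < 1e-9]

print(len(only_sum_equal_1))    # number of weight configurations the loop above would try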





# target_data = pd.read_csv('data/target_playlists.csv')
#
# print("--------------------------")
# print("------Recommendation------")
# print("--------------------------")
# ws.write_submission(target_data, , 'output/submission.csv', at=10)

#print(hybrid_recommender.evaluateRecommendations(URM_test))
    def run_multiprocess_search(self, paramether_dictionary_list,
                                num_cases_max):

        # The following function runs the search in parallel. Since different configurations may have significantly
        # divergent runtimes, processes must be joined starting from the first one that terminates, and the result
        # objects may be large, therefore multiprocessing.Pool is not suitable

        num_cases_evaluated = 0
        num_cases_started = 0
        num_cases_active = 0
        termination_sent = False

        process_list = [None] * self.parallelPoolSize

        queue_job_todo = Queue()
        queue_job_done = Queue()

        get_memory_threshold_reached_partial = partial(
            get_memory_threshold_reached,
            max_ram_occupied_perc=self.max_ram_occupied_perc)

        for current_process_index in range(self.parallelPoolSize):
            newProcess = multiprocessing.Process(
                target=process_worker,
                args=(
                    queue_job_todo,
                    queue_job_done,
                    current_process_index,
                    get_memory_threshold_reached_partial,
                ))

            process_list[current_process_index] = newProcess

            newProcess.start()
            newProcess = None

            print("Started process: {}".format(current_process_index))

        memory_threshold_reached, memory_used_quota = get_memory_threshold_reached(
            self.max_ram_occupied_perc)

        while num_cases_evaluated < num_cases_max:

            # Create as many new jobs as needed
            # Stop:     if the max number of parallel processes is reached or the max RAM occupancy is reached
            #           if there are no other cases to explore
            # If no termination has been sent and active == 0, start one job anyway, otherwise everything stalls
            # WARNING: apparently the function "queue_job_todo.empty()" is not reliable
            while (
                        (num_cases_active < self.parallelPoolSize and not memory_threshold_reached) or (
                                num_cases_active == 0)) \
                    and not termination_sent:

                memory_threshold_reached, memory_used_quota = get_memory_threshold_reached(
                    self.max_ram_occupied_perc)

                if memory_threshold_reached:
                    writeLog(
                        self.ALGORITHM_NAME +
                        ": Memory threshold reached, occupied {:.4f} %\n".
                        format(memory_used_quota), self.logFile)

                if num_cases_started < num_cases_max and not memory_threshold_reached:
                    process_object = Process_object_data_and_evaluation(
                        self.recommender_class, self.dictionary_input,
                        paramether_dictionary_list[num_cases_started],
                        self.ALGORITHM_NAME, self.URM_validation,
                        self.evaluation_function)

                    queue_job_todo.put(process_object)
                    num_cases_started += 1
                    num_cases_active += 1
                    process_object = None
                    gc.collect()

                if num_cases_started >= num_cases_max and not termination_sent:
                    print("Termination sent")
                    queue_job_todo.put(None)
                    termination_sent = True
                gc.collect()

            # Read all completed jobs. WARNING: apparently the function "empty" is not reliable
            queue_job_done_is_empty = False

            while not queue_job_done_is_empty:

                try:
                    process_object = queue_job_done.get_nowait()

                    # self.update_on_new_result(process_object, num_cases_evaluated)
                    num_cases_evaluated += 1
                    num_cases_active -= 1
                    process_object = None

                except Empty:
                    queue_job_done_is_empty = True

            time.sleep(1)
            gc.collect()
            # print("num_cases_evaluated {}".format(num_cases_evaluated))

            # print("Evaluated {}, started {}, active {}".format(num_cases_evaluated, num_cases_started, num_cases_active))

        # Drain the termination sentinel from the job queue (the workers are assumed to re-queue it) before joining
        queue_job_todo.get()

        for current_process in process_list:
            # print("Waiting to Join {}".format(current_process))
            current_process.join()
            print("Joined {}".format(current_process))
    def update_on_new_result(self, process_object, num_cases_evaluated):

        paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(
            process_object.recommender,
            process_object.paramether_dictionary_to_evaluate)

        if process_object.exception is not None:
            writeLog(
                self.ALGORITHM_NAME +
                ": Exception for config {}: {} - Exception: {}\n".format(
                    self.model_counter, paramether_dictionary_to_save,
                    str(process_object.exception)), self.logFile)

            return

        if process_object.result_dict is None:
            writeLog(
                self.ALGORITHM_NAME +
                ": Result is None for config {}: {}\n".format(
                    self.model_counter, paramether_dictionary_to_save),
                self.logFile)

            return

        self.model_counter += 1

        # # Always save best model separately
        # if self.save_model == "all":
        #     # print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path))
        #     # process_object.recommender.saveModel(self.output_root_path,
        #     #                                      file_name="_model_{}".format(self.model_counter))
        #     #
        #     # pickle.dump(paramether_dictionary_to_save.copy(),
        #     #             open(self.output_root_path + "_parameters_{}".format(self.model_counter), "wb"),
        #     #             protocol=pickle.HIGHEST_PROTOCOL)
        #     a = 1 # I DON'T WANT TO SAVE ANY MODEL

        if self.best_solution_val is None or self.best_solution_val < process_object.result_dict[
                self.metric]:

            writeLog(
                self.ALGORITHM_NAME +
                ": New best config found. Config {}: {} - {} results: {}\n".
                format(self.model_counter, paramether_dictionary_to_save,
                       self.metric,
                       process_object.result_dict[self.metric]), self.logFile)

            pickle.dump(paramether_dictionary_to_save.copy(),
                        open(self.output_root_path + "_best_parameters", "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)

            self.best_solution_val = process_object.result_dict[self.metric]
            self.best_solution_parameters = paramether_dictionary_to_save.copy(
            )

            dereference_recommender_attributes(self.best_solution_object)

            self.best_solution_object = process_object.recommender

            # Always save best model separately
            # if self.save_model != "no":
            #     print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path))
            #     process_object.recommender.saveModel(self.output_root_path, file_name="_best_model")

            if self.URM_test is not None:
                self.evaluate_on_test(self.URM_test)

        else:
            writeLog(
                self.ALGORITHM_NAME +
                ": Config is suboptimal. Config {}: {} - {} results: {}\n".
                format(self.model_counter, paramether_dictionary_to_save,
                       self.metric,
                       process_object.result_dict[self.metric]), self.logFile)

            dereference_recommender_attributes(process_object.recommender)

        dump_garbage()
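# NOTE: dereference_recommender_attributes and dump_garbage are imported from elsewhere. The former presumably
# drops references to the large attributes of a recommender that is no longer the best one, so its memory can
# be reclaimed. A rough sketch under that assumption:

import gc

def dereference_recommender_attributes(recommender):

    if recommender is None:
        return

    # Drop every instance attribute (sparse matrices, similarity matrices, ...) so it can be garbage collected
    for attribute_name in list(vars(recommender).keys()):
        setattr(recommender, attribute_name, None)

    gc.collect()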
    def search(self,
               dictionary_input,
               metric="map",
               n_cases=60,
               output_root_path=None,
               parallelPoolSize=6,
               parallelize=True,
               save_model="no",
               max_ram_occupied_perc=None):

        # Associate the params returned by the BayesianOpt object to those you want to save.
        # E.g. with early stopping, the optimal number of epochs is only known afterwards,
        # but you might want to save it as well
        self.from_fit_params_to_saved_params = {}

        self.dictionary_input = dictionary_input.copy()
        self.output_root_path = output_root_path
        self.logFile = open(
            self.output_root_path + "_" + self.ALGORITHM_NAME + ".txt", "a")
        self.metric = metric
        self.model_counter = 0

        if max_ram_occupied_perc is None:
            self.max_ram_occupied_perc = 0.7
        else:
            # Check whether the current RAM status can be read
            try:
                get_RAM_status()
                self.max_ram_occupied_perc = max_ram_occupied_perc
            except Exception:
                writeLog(
                    self.ALGORITHM_NAME +
                    ": Unable to read RAM status, ignoring max RAM setting",
                    self.logFile)
                self.max_ram_occupied_perc = None

        if save_model in ["no", "best", "all"]:
            # self.save_model = save_model
            pass  # Model saving is intentionally disabled in this version of the search
        else:
            raise ValueError(
                self.ALGORITHM_NAME +
                ": save_model not recognized, acceptable values are: {}, given is {}"
                .format(["no", "best", "all"], save_model))

        if parallelPoolSize is None:
            self.parallelPoolSize = 1
        else:
            # self.parallelPoolSize = int(multiprocessing.cpu_count()/2)
            self.parallelPoolSize = parallelPoolSize

        self.best_solution_val = None
        self.best_solution_parameters = None
        self.best_solution_object = None

        paramether_dictionary_list = self.build_all_cases_to_evaluate(n_cases)

        # Randomize ordering of cases
        random.shuffle(paramether_dictionary_list)

        self.runSingleCase_partial = partial(self.runSingleCase, metric=metric)

        if parallelize:
            self.run_multiprocess_search(paramether_dictionary_list, n_cases)
        else:
            self.run_singleprocess_search(paramether_dictionary_list, n_cases)

        writeLog(
            self.ALGORITHM_NAME +
            ": Best config is: Config {}, {} value is {:.4f}\n".format(
                self.best_solution_parameters, metric, self.best_solution_val),
            self.logFile)

        return self.best_solution_parameters.copy()
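# NOTE: get_RAM_status and get_memory_threshold_reached are imported from elsewhere. A plausible
# psutil-based implementation, consistent with how they are consumed above, could look as follows
# (the exact contract, in particular whether the quota is a fraction or a percentage, is an assumption):

import psutil

def get_RAM_status():
    # Total, available and used percentage of system RAM
    memory = psutil.virtual_memory()
    return memory.total, memory.available, memory.percent

def get_memory_threshold_reached(max_ram_occupied_perc):
    # Returns (threshold_reached, memory_used_quota), where the quota is the fraction of RAM currently in use
    _, _, used_percent = get_RAM_status()
    memory_used_quota = used_percent / 100.0

    if max_ram_occupied_perc is None:
        return False, memory_used_quota

    return memory_used_quota >= max_ram_occupied_perc, memory_used_quota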
Example #6
    def __objective_function(self, current_fit_parameters_values):


        current_fit_parameters = dict(zip(self.hyperparams_names, current_fit_parameters_values))


        result_dict, _, recommender_instance = self.__evaluate(current_fit_parameters, self.evaluator_validation)

        current_result = - result_dict[self.metric_to_optimize]


        paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(recommender_instance, current_fit_parameters)

        self.from_fit_params_to_saved_params[frozenset(current_fit_parameters.items())] = paramether_dictionary_to_save





        # Always save best model separately
        if self.save_model == "all":

            print(self.ALGORITHM_NAME + ": Saving model in {}\n".format(self.output_root_path + self.output_file_name_root))

            recommender_instance.saveModel(self.output_root_path, file_name = self.output_file_name_root + "_model_{}".format(self.model_counter))

            pickle.dump(paramether_dictionary_to_save.copy(),
                        open(self.output_root_path + self.output_file_name_root + "_parameters_{}".format(self.model_counter), "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)



        if self.best_solution_val is None or self.best_solution_val < result_dict[self.metric_to_optimize]:

            writeLog("BayesianSearch: New best config found. Config {}: {} - results: {}\n".format(self.model_counter, paramether_dictionary_to_save, result_dict), self.log_file)

            pickle.dump(paramether_dictionary_to_save.copy(),
                        open(self.output_root_path + self.output_file_name_root + "_best_parameters", "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)

            pickle.dump(result_dict.copy(),
                        open(self.output_root_path + self.output_file_name_root + "_best_result_validation", "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)

            self.best_solution_val = result_dict[self.metric_to_optimize]
            self.best_solution_parameters = paramether_dictionary_to_save.copy()

            if self.save_model != "no":
                print("BayesianSearch: Saving model in {}\n".format(self.output_root_path + self.output_file_name_root))
                recommender_instance.saveModel(self.output_root_path, file_name = self.output_file_name_root + "_best_model")


            if self.evaluator_test is not None:
                result_dict_test = self.__evaluate_on_test(recommender_instance)

                pickle.dump(result_dict_test,
                            open(self.output_root_path + self.output_file_name_root + "_best_result_test", "wb"),
                            protocol=pickle.HIGHEST_PROTOCOL)


        else:
            writeLog("BayesianSearch: Config {} is suboptimal. Config: {} - results: {}\n".format(self.model_counter, paramether_dictionary_to_save, result_dict), self.log_file)



        self.model_counter += 1


        return current_result
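# NOTE: __objective_function follows the usual scikit-optimize pattern: the optimizer passes hyperparameter
# values positionally, they are zipped with self.hyperparams_names, and the negated metric is returned because
# the optimizer minimizes. A self-contained sketch of that wiring with skopt directly (the search space, the
# placeholder score and the parameter names are illustrative assumptions, not the repository's code):

from skopt import gp_minimize
from skopt.space import Integer, Real

hyperparams_names = ["topK", "shrink"]
search_space = [Integer(5, 800, name="topK"), Real(0.0, 100.0, name="shrink")]

def objective_function(hyperparams_values):
    params = dict(zip(hyperparams_names, hyperparams_values))
    # Placeholder score; a real objective would fit a recommender and evaluate it on the validation split
    score = 1.0 / (1.0 + abs(params["topK"] - 100) + params["shrink"])
    return -score

result = gp_minimize(objective_function, search_space, n_calls=20, random_state=42)
print("Best values found:", result.x, "- best objective (negated metric):", result.fun)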
    def search(self,
               dictionary_input,
               metric="map",
               n_cases=30,
               output_root_path=None,
               parallelPoolSize=2,
               parallelize=True,
               save_model="best"):

        # Associate the params returned by the BayesianOpt object to those you want to save.
        # E.g. with early stopping, the optimal number of epochs is only known afterwards,
        # but you might want to save it as well
        self.from_fit_params_to_saved_params = {}

        self.dictionary_input = dictionary_input.copy()
        self.output_root_path = output_root_path
        self.logFile = open(self.output_root_path + "_BayesianSearch.txt", "a")
        self.metric = metric
        self.model_counter = 0

        if save_model in ["no", "best", "all"]:
            self.save_model = save_model
        else:
            raise ValueError(
                self.ALGORITHM_NAME +
                ": save_model not recognized, acceptable values are: {}, given is {}"
                .format(["no", "best", "all"], save_model))

        if not parallelize:
            self.parallelPoolSize = 1
        else:
            self.parallelPoolSize = parallelPoolSize

        self.best_solution_val = None
        self.best_solution_parameters = None
        self.best_solution_object = None

        paramether_dictionary_list = self.build_all_cases_to_evaluate()

        self.runSingleCase_partial = partial(self.runSingleCase, metric=metric)

        if parallelize:
            self.run_multiprocess_search(paramether_dictionary_list, n_cases)
        else:
            self.run_singleprocess_search(paramether_dictionary_list, n_cases)

        writeLog(
            self.ALGORITHM_NAME +
            ": Best config is: Config {}, {} value is {:.4f}\n".format(
                self.best_solution_parameters, metric, self.best_solution_val),
            self.logFile)

        return self.best_solution_parameters.copy()


#     def runSingleCase(self, paramether_dictionary, dictionary, folderPath = None, namePrefix = None):
#
#         try:
#
#             # Create an object of the same class as the input
#             # Passing the parameters as a dictionary
#             recommender = self.recommender_class(*dictionary[DictionaryKeys.CONSTRUCTOR_POSITIONAL_ARGS],
#                                                  **dictionary[DictionaryKeys.CONSTRUCTOR_KEYWORD_ARGS])
#
#
#             print("GridSearch: Testing config: {}".format(paramether_dictionary))
#
#             recommender.fit(*dictionary[DictionaryKeys.FIT_POSITIONAL_ARGS],
#                             **dictionary[DictionaryKeys.FIT_KEYWORD_ARGS],
#                             **paramether_dictionary)
#
#
#             paramether_dictionary_to_save = self.from_fit_params_to_saved_params_function(recommender, paramether_dictionary)
#
#
#             if folderPath != None:
#                 recommender.saveModel(folderPath, namePrefix = namePrefix)
#
#
#             #return recommender.evaluateRecommendations(self.URM_validation, at=5, mode="sequential")
#             return self.evaluation_function(recommender, self.URM_validation, paramether_dictionary), paramether_dictionary_to_save
#
#
#         except Exception as e:
#
#             print("GridSearch: Testing config: {} - Exception {}\n".format(paramether_dictionary, str(e)))
#             traceback.print_exc()
#
#             return None
#
#
#
#     def search(self, dictionary, metric ="map", logFile = None, parallelPoolSize = 2, parallelize = True,
#                folderPath = None, namePrefix = None):
#
#         hyperparamethers_range_dictionary = dictionary[DictionaryKeys.FIT_RANGE_KEYWORD_ARGS]
#
#         key_list = list(hyperparamethers_range_dictionary.keys())
#
#         # Unpack list ranges from hyperparamethers to validate onto
#         # * operator allows to transform a list of objects into positional arguments
#         test_cases = itertools.product(*hyperparamethers_range_dictionary.values())
#
#         paramether_dictionary_list = []
#
#         at_least_one_evaluation_done = False
#
#         for current_case in test_cases:
#
#             paramether_dictionary = {}
#
#             for index in range(len(key_list)):
#
#                 paramether_dictionary[key_list[index]] = current_case[index]
#
#             paramether_dictionary_list.append(paramether_dictionary)
#
#             #results_test = self.runSingleCase(dictionary, paramether_dictionary, logFile)
#
#             if len(paramether_dictionary_list) >= parallelPoolSize or not parallelize:
#
#                 at_least_one_evaluation_done = True
#                 self.evaluateBlock(dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize)
#
#                 # Reset paramether list for next block
#                 paramether_dictionary_list = []
#
#
#         if not at_least_one_evaluation_done:
#             # Test cases are less than number of parallel threads
#             at_least_one_evaluation_done = True
#             self.evaluateBlock(dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize)
#
#
#         writeLog("GridSearch: Best config is: Config {}, {} value is {:.4f}\n".format(self.paramether_dictionary_best, metric, self.results_test_best[metric]), logFile)
#
#         if folderPath != None:
#
#             writeLog("BayesianSearch: Saving model in {}\n".format(folderPath), logFile)
#             self.runSingleCase(self.paramether_dictionary_best, metric, folderPath = folderPath, namePrefix = namePrefix)
#
#
#         return self.paramether_dictionary_best
#
#
#
#     def evaluateBlock(self, dictionary, paramether_dictionary_list, metric, logFile, parallelPoolSize, parallelize):
#
#         if parallelize:
#
#             runSingleCase_partial = partial(self.runSingleCase,
#                                             dictionary=dictionary)
#
#             pool = multiprocessing.Pool(processes=parallelPoolSize, maxtasksperchild=1)
#             resultList = pool.map(runSingleCase_partial, paramether_dictionary_list)
#
#             pool.close()
#
#         else:
#             resultList = self.runSingleCase(paramether_dictionary_list[0], dictionary)
#             resultList = [resultList]
#
#
#         for results_index in range(len(resultList)):
#
#             results_test, paramether_dictionary_test = resultList[results_index]
#
#             if results_test!=None:
#
#                 if metric not in self.results_test_best or results_test[metric] > self.results_test_best[metric]:
#
#                     self.results_test_best = results_test.copy()
#                     self.paramether_dictionary_best = paramether_dictionary_test.copy()
#
#                     writeLog("GridSearch: New best config found. Config {}, {} value is {:.4f}\n".format(paramether_dictionary_test, metric, self.results_test_best[metric]), logFile)
#
#                 else:
#                     writeLog("GridSearch: Config is suboptimal. Config {}, {} value is {:.4f}\n".format(paramether_dictionary_test, metric, results_test[metric]), logFile)
#
#
#
#
#
#
#
# if __name__ == '__main__':
#
#     from MatrixFactorization.Cython.MF_BPR_Cython import MF_BPR_Cython
#     from data.NetflixEnhanced.NetflixEnhancedReader import NetflixEnhancedReader
#
#     dataReader = NetflixEnhancedReader()
#     URM_train = dataReader.get_URM_train()
#     URM_test = dataReader.get_URM_test()
#
#     logFile = open("BPR_MF_GridSearch.txt", "a")
#
#
#     gridSearch = GridSearch(MF_BPR_Cython, None, URM_test, None)
#
#
#     hyperparamethers_range_dictionary = {}
#     hyperparamethers_range_dictionary["num_factors"] = list(range(1, 51, 5))
#     hyperparamethers_range_dictionary["epochs"] = list(range(1, 51, 10))
#     hyperparamethers_range_dictionary["batch_size"] = list(range(1, 101, 50))
#     hyperparamethers_range_dictionary["learning_rate"] = [1e-1, 1e-2, 1e-3, 1e-4]
#
#
#
#     recommenderDictionary = {DictionaryKeys.CONSTRUCTOR_POSITIONAL_ARGS: [URM_train],
#                              DictionaryKeys.CONSTRUCTOR_KEYWORD_ARGS: dict(),
#                              DictionaryKeys.FIT_POSITIONAL_ARGS: dict(),
#                              DictionaryKeys.FIT_KEYWORD_ARGS: dict(),
#                              DictionaryKeys.FIT_RANGE_KEYWORD_ARGS: hyperparamethers_range_dictionary}
#
#     best_paramethers = gridSearch.search(recommenderDictionary, logFile = logFile)
#
#     print(best_paramethers)
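# NOTE: for reference, the grid enumeration in the commented-out GridSearch above (unpacking the range
# dictionary values into itertools.product and rebuilding one parameter dictionary per case) boils down to
# the following self-contained lines; the example ranges are arbitrary:

import itertools

hyperparameters_range_dictionary = {
    "num_factors": [1, 6, 11],
    "learning_rate": [1e-2, 1e-3],
}

key_list = list(hyperparameters_range_dictionary.keys())

# The * operator unpacks the lists of candidate values as positional arguments of itertools.product
test_cases = itertools.product(*hyperparameters_range_dictionary.values())

parameter_dictionary_list = [dict(zip(key_list, current_case)) for current_case in test_cases]

print(len(parameter_dictionary_list))    # 3 * 2 = 6 configurations
print(parameter_dictionary_list[0])      # {'num_factors': 1, 'learning_rate': 0.01}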