def __init__(self, year=2007, fold_id=0, exclude_qf=False, **kwargs):
    super(LetorRankingDatasetReader, self).__init__(dataset_folder="letor", **kwargs)
    # Validate the year before using it anywhere.
    if year not in [2007, 2008]:
        raise ValueError("year must be either 2007 or 2008")
    self.year = year
    # Folder names for the two LETOR releases; formatting both with the
    # requested year would make them identical, so use the literal years.
    self.DATASET_FOLDER_2007 = "MQ2007"
    self.DATASET_FOLDER_2008 = "MQ2008"
    self.exclude_qf = exclude_qf
    # Indices of the query-only features among the 46 LETOR features.
    self.query_feature_indices = [4, 5, 6, 19, 20, 21, 34, 35, 36]
    self.query_document_feature_indices = np.delete(
        np.arange(0, 46), self.query_feature_indices)
    logger.info("For year {}, exclude query features: {}".format(
        self.year, self.exclude_qf))
    self.dataset_indices = np.arange(5)
    self.condition = np.zeros(5, dtype=bool)
    self.file_format = os.path.join(self.dirname, str(self.year), "S{}.h5")
    create_dir_recursively(self.file_format, is_file_path=True)
    for i in self.dataset_indices:
        h5py_file_pth = self.file_format.format(i + 1)
        self.condition[i] = os.path.isfile(h5py_file_pth)
        if not self.condition[i]:
            logger.info("File {} not created".format(h5py_file_pth))
    assert fold_id in self.dataset_indices, \
        "No test dataset present for fold {}".format(fold_id + 1)
    logger.info("Test dataset is S{}".format(fold_id + 1))
    self.fold_id = fold_id
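
# Hypothetical usage sketch for the reader above; the class name and
# get_single_train_test_split() appear elsewhere in this code, while the
# chosen year and fold are purely illustrative.
reader = LetorRankingDatasetReader(year=2008, fold_id=2, exclude_qf=True)
X_train, Y_train, X_test, Y_test = reader.get_single_train_test_split()
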
def __init__(self, year=2007, fold_id=0, exclude_qf=False, **kwargs):
    super(LetorListwiseDatasetReader, self).__init__(dataset_folder='letor', **kwargs)
    self.logger = logging.getLogger(LetorListwiseDatasetReader.__name__)
    # Unlike the ranking reader, this reader falls back to 2007 instead of
    # raising; log the fallback so it does not happen silently.
    if year not in [2007, 2008]:
        self.logger.warning("Unknown year {}, falling back to 2007".format(year))
        self.year = 2007
    else:
        self.year = year
    # Listwise variants of the two LETOR release folders; as above, the
    # literal years keep the two attributes distinct.
    self.DATASET_FOLDER_2007 = 'MQ2007-list'
    self.DATASET_FOLDER_2008 = 'MQ2008-list'
    self.exclude_qf = exclude_qf
    self.query_feature_indices = [4, 5, 6, 19, 20, 21, 34, 35, 36]
    self.query_document_feature_indices = np.delete(
        np.arange(0, 46), self.query_feature_indices)
    self.logger.info("For year {}, exclude query features: {}".format(
        self.year, self.exclude_qf))
    self.dataset_indices = np.arange(5)
    self.condition = np.zeros(5, dtype=bool)
    self.file_format = os.path.join(self.dirname, str(self.year), "I{}.h5")
    create_dir_recursively(self.file_format, is_file_path=True)
    for i in self.dataset_indices:
        h5py_file_pth = self.file_format.format(i + 1)
        self.condition[i] = os.path.isfile(h5py_file_pth)
        if not self.condition[i]:
            self.logger.info("File {} not created".format(h5py_file_pth))
    assert fold_id in self.dataset_indices, \
        "No test dataset present for fold {}".format(fold_id + 1)
    self.logger.info("Test dataset is I{}".format(fold_id + 1))
    self.fold_id = fold_id
    self.__load_dataset__()
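
# Standalone check of the feature-index split used by both readers above:
# 46 LETOR features minus the 9 query-only indices leaves 37 query-document
# features. Runnable as-is with only NumPy.
import numpy as np

query_feature_indices = [4, 5, 6, 19, 20, 21, 34, 35, 36]
query_document_feature_indices = np.delete(np.arange(0, 46),
                                           query_feature_indices)
assert len(query_document_feature_indices) == 37
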
def configure_logging():
    log_path = os.path.join(DIR_NAME, 'results', 'compiling_result.log')
    create_dir_recursively(log_path, True)
    log_path = rename_file_if_exist(log_path)
    global logger
    logging.basicConfig(filename=log_path, level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(name='Compiling Results')
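
# Minimal sketch of the intended call pattern: run configure_logging() once,
# then use the module-level logger it sets up (DIR_NAME and the log file name
# come from the function above).
configure_logging()
logger.info("started compiling results")  # written to results/compiling_result.log
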
def __init__(self, ranker_class, optimizer_path, ranker_params=None,
             fit_params=None, random_state=None, tuning_callbacks=None,
             validation_loss=None, learning_problem=OBJECT_RANKING, **kwd):
    self.logger = logging.getLogger(PARAMETER_OPTIMIZER)
    # Fallback ranker class per learning problem, used when none is given.
    default_rankers = {OBJECT_RANKING: FATEObjectRanker,
                       LABEL_RANKING: FATELabelRanker,
                       DISCRETE_CHOICE: FATEObjectChooser,
                       DYAD_RANKING: FATEContextualRanker}
    create_dir_recursively(optimizer_path, True)
    self.optimizer_path = optimizer_path
    if ranker_class is None:
        self._ranker_class = default_rankers[learning_problem]
    else:
        check_ranker_class(ranker_class)
        self._ranker_class = ranker_class
    if ranker_params is None:
        raise ValueError('Ranker parameters cannot be empty')
    self._ranker_params = ranker_params
    if tuning_callbacks is None:
        self.tuning_callbacks = []
    else:
        self.tuning_callbacks = tuning_callbacks
    default_validation_loss = {OBJECT_RANKING: zero_one_rank_loss,
                               LABEL_RANKING: zero_one_rank_loss,
                               DISCRETE_CHOICE: categorical_hinge,
                               DYAD_RANKING: zero_one_rank_loss}
    if validation_loss is None:
        self.validation_loss = default_validation_loss[learning_problem]
        self.logger.info('Loss function is not specified, using {}'.format(
            default_validation_loss[learning_problem].__name__))
    else:
        self.validation_loss = validation_loss
    if fit_params is None:
        self._fit_params = {}
        self.logger.warning("Fit params are empty, the default parameters will be applied")
    else:
        self._fit_params = fit_params
    self.random_state = check_random_state(random_state)
    self.model = None
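
# The check_random_state(random_state) call above follows scikit-learn's
# semantics (assuming it is sklearn's helper or an equivalent): None yields
# the global RandomState, an int seeds a fresh one, and an existing
# RandomState instance is passed through unchanged.
from sklearn.utils import check_random_state
import numpy as np

rs = check_random_state(42)
assert isinstance(rs, np.random.RandomState)
assert check_random_state(rs) is rs
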
def generate_concise_results_for_dataset(dataset='medoid',
                                         directory='logs_single_fold',
                                         result_directory='results'):
    ranker_names = sorted(ranker_options.keys())
    metric_names.sort()
    data = []
    # Header row, padded so every row has one cell per metric column.
    header = ['**************', dataset.upper(), '**************']
    data.append(header + [''] * (len(metric_names) - len(header)))
    for ranker_name in ranker_names:
        log_path = os.path.join(DIR_NAME, directory,
                                '{}_{}.log'.format(dataset, ranker_name))
        try:
            with open(log_path) as f:
                lines = np.array([line.rstrip('\n') for line in f])
        except FileNotFoundError:
            logger.error('File {} is not found'.format(log_path))
            data.append(['NE' for _ in range(len(metric_names))])
            continue
        one_row = []
        for out in metric_names:
            try:
                # Take the first log line mentioning the metric and keep the
                # value after the "<metric> : " separator.
                matching = [s for s in lines if out in s][0]
                one_row.append(matching.split(out + ' : ')[-1])
            except IndexError:
                logger.error('Metric {} of ranker {} is not evaluated for dataset {}'.format(
                    out, ranker_name, dataset))
                one_row.append('NE')
        data.append(one_row)
    columns = [name.upper() for name in metric_names]
    indexes = [name.upper() for name in ranker_names]
    indexes.insert(0, 'DATASET')
    dataFrame = pd.DataFrame(data, index=indexes, columns=columns)
    file_path = os.path.join(DIR_NAME, result_directory, '{}.csv'.format(dataset))
    create_dir_recursively(file_path, True)
    dataFrame.to_csv(file_path)
    return dataFrame
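
# Hypothetical invocation with the defaults shown above; it reads the
# logs_single_fold/medoid_<ranker>.log files and writes results/medoid.csv.
df = generate_concise_results_for_dataset(dataset='medoid',
                                          directory='logs_single_fold',
                                          result_directory='results')
print(df)
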
file_name_format = '{}_{}'
if cindex <= 0:
    folder = "{}_single_fold"
    result_folder = "single_cv_results"
else:
    folder = "{}_multiple_folds"
    result_folder = "multiple_cv_results"
    # str.format returns a new string, so the result must be assigned;
    # this turns '{}_{}' into '{}_{}_<cindex>' for multi-fold runs.
    file_name_format = file_name_format.format(file_name_format, cindex)
log_path = os.path.join(
    DIR_PATH, folder.format('logs'),
    file_name_format.format(dataset_str, ranker_name) + '.log')
random_state = np.random.RandomState(seed=seed)
create_dir_recursively(log_path, True)
# log_path = rename_file_if_exist(log_path)
logger = configure_logging_numpy_keras(seed=random_state.randint(2 ** 32),
                                       log_path=log_path)
logger.debug(arguments)
dataset_function_params['random_state'] = random_state
ranker, dataset_reader = get_ranker_and_dataset_functions(
    ranker_name, dataset_name, dataset_function_params, problem)
X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
n_features, n_objects = log_test_train_data(X_train, X_test, logger)
ranker_params, fit_params, parameter_ranges = get_ranker_parameters(
    ranker_name, n_features, n_objects, dataset_name, dataset_function_params)
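
# Standalone illustration of the (now assigned) chained-format trick above:
# formatting the template with itself appends the fold index while keeping
# the two placeholders for dataset and ranker names.
file_name_format = '{}_{}'
file_name_format = file_name_format.format(file_name_format, 3)
assert file_name_format == '{}_{}_3'
assert file_name_format.format('letor', 'fate_ranker') == 'letor_fate_ranker_3'
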
def __init__(self, ranker, optimizer_path, tunable_parameter_ranges,
             fit_params=None, random_state=None, tuning_callbacks=None,
             validation_loss=None, learning_problem=OBJECT_RANKING, **kwd):
    """
    Parameters
    ----------
    ranker : object
        The ranker object whose hyperparameters need to be optimized.
    optimizer_path : string
        The path where the complete optimizer pickle object can be stored.
    tunable_parameter_ranges : dict
        Dictionary keyed by the objects to be tuned. Each value is a
        dictionary mapping the tunable parameters to the ranges in which
        they should be tuned.
    fit_params : dict
        The fitting parameters for the ranker.
    random_state : int or object
        Numpy random state.
    tuning_callbacks : list of object
        Callbacks for the parameter optimizer.
    validation_loss : function
        Differentiable loss function for the ranker.
    learning_problem : string
        The learning problem to which the ranker belongs.
    **kwd :
        Keyword arguments for the hidden units.
    """
    self.logger = logging.getLogger(PARAMETER_OPTIMIZER)
    create_dir_recursively(optimizer_path, True)
    self.optimizer_path = optimizer_path
    self._tunable_parameter_ranges = tunable_parameter_ranges
    check_ranker_class(ranker)
    self.ranker = ranker
    if tuning_callbacks is None:
        self.tuning_callbacks = []
    else:
        self.tuning_callbacks = tuning_callbacks
    default_validation_loss = {
        OBJECT_RANKING: zero_one_rank_loss,
        LABEL_RANKING: zero_one_rank_loss,
        DISCRETE_CHOICE: categorical_hinge,
        DYAD_RANKING: zero_one_rank_loss
    }
    if validation_loss is None:
        self.validation_loss = default_validation_loss[learning_problem]
        self.logger.info('Loss function is not specified, using {}'.format(
            default_validation_loss[learning_problem].__name__))
    else:
        self.validation_loss = validation_loss
    if fit_params is None:
        self._fit_params = {}
        self.logger.warning(
            "Fit params are empty, the default parameters will be applied")
    else:
        self._fit_params = fit_params
    self.random_state = check_random_state(random_state)
    self.model = None
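
# Shape of tunable_parameter_ranges as described in the docstring above:
# keyed by the object to tune, mapping parameter names to ranges. The
# parameter names and bounds below are purely illustrative.
tunable_parameter_ranges = {
    ranker: {"n_hidden": (1, 10), "learning_rate": (1e-5, 1e-1)},
}
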
learner_params = job_description["learner_params"]
duration = job_description["duration"]
hp_iters = int(job_description["hp_iters"])
hp_ranges = job_description["hp_ranges"]
hp_fit_params = job_description["hp_fit_params"]
learning_problem = job_description["learning_problem"]
experiment_schema = job_description["experiment_schema"]
experiment_table = job_description["experiment_table"]
validation_loss = job_description["validation_loss"]
hash_value = job_description["hash_value"]
random_state = np.random.RandomState(seed=seed + fold_id)
log_path = os.path.join(DIR_PATH, LOGS_FOLDER, "{}.log".format(hash_value))
optimizer_path = os.path.join(DIR_PATH, OPTIMIZER_FOLDER, "{}".format(hash_value))
create_dir_recursively(log_path, True)
create_dir_recursively(optimizer_path, True)
setup_logging(log_path=log_path)
configure_numpy_keras(seed=seed)
logger = logging.getLogger('Experiment')
logger.info("DB config filePath {}".format(config_file_path))
logger.info("Arguments {}".format(arguments))
logger.info("Job Description {}".format(print_dictionary(job_description)))
duration = get_duration_seconds(duration)
dataset_params['random_state'] = random_state
dataset_params['fold_id'] = fold_id
dataset_reader = get_dataset_reader(dataset_name, dataset_params)
X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
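
# The seed + fold_id pattern above gives every fold its own reproducible
# random stream while the overall run stays deterministic; a minimal
# standalone version with illustrative values:
import numpy as np

seed, fold_id = 42, 3
random_state = np.random.RandomState(seed=seed + fold_id)
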
columns.append(ZERO_ONE_RANK_ACCURACY)
for ranker_name in rankers_dict[OBJECT_RANKING].keys():
    one_row = ["{}".format(ranker_name.upper())]
    df_path = os.path.join(DIR_PATH, "multiple_cv_results",
                           '{}_{}.csv'.format(dataset_name, ranker_name))
    if os.path.isfile(df_path):
        df = pd.read_csv(df_path)
        # Accuracy is one minus the zero-one rank loss with ties.
        df[ZERO_ONE_RANK_ACCURACY] = 1.0 - df["ZeroOneRankLossTies"]
        std = df.std(axis=0).values
        mean = df.mean(axis=0).values
        one_row.extend(["{:.3f}+-{:.3f}".format(m, s)
                        for m, s in zip(mean, std)])
        # Only append complete rows; a missing file would otherwise leave a
        # row with just the ranker name and break the DataFrame shape.
        data.append(one_row)
if len(data) > 0:
    df_path = os.path.join(DIR_PATH, "multiple_cv_results",
                           '{}_dataset.csv'.format(dataset_name))
    create_dir_recursively(df_path, is_file_path=True)
    dataFrame = pd.DataFrame(data, columns=columns)
    dataFrame.to_csv(df_path)
    del dataFrame["KendallsTau"]
    del dataFrame["ZeroOneRankLossTies"]
    del dataFrame["ZeroOneRankLoss"]
    print(dataFrame.to_latex())
    del dataFrame
df_paths = os.path.join(DIR_PATH, "single_cv_results", '*.csv')
for file_path in glob.glob(df_paths):
    print("Dataset {}".format(os.path.basename(file_path)))
    dataFrame = pd.read_csv(file_path)
    dataFrame = dataFrame.round(3)
    del dataFrame["KendallsTau"]
    del dataFrame["ZeroOneRankLoss"]
    if "ZeroOneAccuracy" in dataFrame:
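
# The "mean+-std" cell formatting used above, shown standalone:
cell = "{:.3f}+-{:.3f}".format(0.8123, 0.0341)
assert cell == '0.812+-0.034'
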
ranker_name = arguments['--ranker_name']
problem = arguments['--problem']
duration = arguments['--duration']
duration = get_duration_microsecond(duration)
dataset_name, ranker_name = get_applicable_ranker_dataset(
    dataset_name=dataset_name, ranker_name=ranker_name, problem=problem)
dataset_function_params, dataset_str = get_dataset_str(
    dataset_function_params, dataset_name)
log_path = os.path.join(DIR_PATH, LOGS_FOLDER,
                        (FILE_FORMAT + '.log').format(dataset_str, ranker_name))
create_dir_recursively(log_path, True)
# log_path = rename_file_if_exist(log_path)
random_state = np.random.RandomState(seed=seed)
logger = configure_logging_numpy_keras(seed=random_state.randint(2 ** 32),
                                       log_path=log_path)
logger.debug(arguments)
logger.debug("The duration {}".format(microsec_to_time(duration)))
logger.debug("Started the experiment at {}".format(start))
dataset_function_params["random_state"] = random_state
ranker, dataset_reader = get_ranker_and_dataset_functions(
    ranker_name, dataset_name, dataset_function_params, problem)
X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
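
# Deriving a fresh 32-bit seed for configure_logging_numpy_keras from the
# experiment's RandomState, exactly as the scripts above do; `seed` stands
# in for the value parsed earlier in the script.
import numpy as np

random_state = np.random.RandomState(seed=seed)
keras_seed = random_state.randint(2 ** 32)  # may raise on platforms where the default integer is 32-bit
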