Example #1
    def __init__(self, year=2007, fold_id=0, exclude_qf=False, **kwargs):
        super(LetorRankingDatasetReader, self).__init__(dataset_folder="letor",
                                                        **kwargs)
        if year not in [2007, 2008]:
            raise ValueError("year must be either 2007 or 2008")
        self.year = year
        # Each LETOR variant lives in its own folder; name them explicitly
        # rather than formatting both constants with the requested year.
        self.DATASET_FOLDER_2007 = "MQ2007"
        self.DATASET_FOLDER_2008 = "MQ2008"
        self.exclude_qf = exclude_qf
        self.query_feature_indices = [4, 5, 6, 19, 20, 21, 34, 35, 36]
        self.query_document_feature_indices = np.delete(
            np.arange(0, 46), self.query_feature_indices)
        logger.info("For Year {} excluding query features {}".format(
            self.year, self.exclude_qf))

        self.dataset_indices = np.arange(5)
        self.condition = np.zeros(5, dtype=bool)
        self.file_format = os.path.join(self.dirname, str(self.year), "S{}.h5")
        create_dir_recursively(self.file_format, is_file_path=True)

        for i in self.dataset_indices:
            h5py_file_pth = self.file_format.format(i + 1)
            self.condition[i] = os.path.isfile(h5py_file_pth)
            if not self.condition[i]:
                logger.info("File {} has not been created yet".format(h5py_file_pth))
        assert fold_id in self.dataset_indices, \
            "No test dataset present for fold {}".format(fold_id + 1)
        logger.info("Test dataset is S{}".format(fold_id + 1))
        self.fold_id = fold_id
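
A minimal usage sketch for the constructor above (the values are illustrative; any remaining keyword arguments are forwarded to the base reader):

# Hypothetical call: MQ2008 ranking data with fold S3 as the test set.
reader = LetorRankingDatasetReader(year=2008, fold_id=2, exclude_qf=True)
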
Example #2
    def __init__(self, year=2007, fold_id=0, exclude_qf=False, **kwargs):
        super(LetorListwiseDatasetReader,
              self).__init__(dataset_folder='letor', **kwargs)
        self.DATASET_FOLDER_2007 = 'MQ2007-list'
        self.DATASET_FOLDER_2008 = 'MQ2008-list'
        self.logger = logging.getLogger(LetorListwiseDatasetReader.__name__)

        # Silently fall back to 2007 when an unsupported year is requested.
        self.year = year if year in (2007, 2008) else 2007
        self.exclude_qf = exclude_qf
        self.query_feature_indices = [4, 5, 6, 19, 20, 21, 34, 35, 36]
        self.query_document_feature_indices = np.delete(
            np.arange(0, 46), self.query_feature_indices)
        self.logger.info("For Year {} excluding query features {}".format(
            self.year, self.exclude_qf))

        self.dataset_indices = np.arange(5)
        self.condition = np.zeros(5, dtype=bool)
        self.file_format = os.path.join(self.dirname, str(self.year), "I{}.h5")
        create_dir_recursively(self.file_format, is_file_path=True)

        for i in self.dataset_indices:
            h5py_file_pth = self.file_format.format(i + 1)
            self.condition[i] = os.path.isfile(h5py_file_pth)
            if not self.condition[i]:
                self.logger.info("File {} has not been created yet".format(h5py_file_pth))
        assert fold_id in self.dataset_indices, \
            "No test dataset present for fold {}".format(fold_id + 1)
        self.logger.info("Test dataset is I{}".format(fold_id + 1))
        self.fold_id = fold_id
        self.__load_dataset__()
Example #3
def configure_logging():
    log_path = os.path.join(DIR_NAME, 'results', 'compiling_result.log')
    create_dir_recursively(log_path, True)
    log_path = rename_file_if_exist(log_path)
    global logger
    logging.basicConfig(filename=log_path,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(name='Compiling Results')
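
A short usage sketch, assuming DIR_NAME and the helper functions above are defined in the same module:

# Hypothetical call: set up the module-level logger, then log through it.
configure_logging()
logger.info("Started compiling results")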
Example #4
    def __init__(self, ranker_class, optimizer_path, ranker_params=None, fit_params=None,
                 random_state=None, tuning_callbacks=None, validation_loss=None, learning_problem=OBJECT_RANKING,
                 **kwd):
        self.logger = logging.getLogger(PARAMETER_OPTIMIZER)

        default_rankers = {OBJECT_RANKING: FATEObjectRanker, LABEL_RANKING: FATELabelRanker,
                           DISCRETE_CHOICE: FATEObjectChooser, DYAD_RANKING: FATEContextualRanker}
        create_dir_recursively(optimizer_path, True)
        self.optimizer_path = optimizer_path

        if ranker_class is None:
            self._ranker_class = default_rankers[learning_problem]
        else:
            check_ranker_class(ranker_class)
            self._ranker_class = ranker_class

        if ranker_params is None:
            raise ValueError('Ranker parameters cannot be empty')
        self._ranker_params = ranker_params

        if tuning_callbacks is None:
            self.tuning_callbacks = []
        else:
            self.tuning_callbacks = tuning_callbacks
        default_validation_loss = {OBJECT_RANKING: zero_one_rank_loss, LABEL_RANKING: zero_one_rank_loss,
                                   DISCRETE_CHOICE: categorical_hinge, DYAD_RANKING: zero_one_rank_loss}
        if validation_loss is None:
            self.validation_loss = default_validation_loss[learning_problem]
            self.logger.info(
                'Loss function is not specified, using {}'.format(default_validation_loss[learning_problem].__name__))
        else:
            self.validation_loss = validation_loss

        if fit_params is None:
            self._fit_params = {}
            self.logger.warning("Fit params are empty, the default parameters will be applied")
        else:
            self._fit_params = fit_params

        self.random_state = check_random_state(random_state)
        self.model = None
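
A sketch of how this constructor might be invoked (the enclosing class is not shown above; ParameterOptimizer is a hypothetical stand-in, and the parameter dict is illustrative):

# Hypothetical: fall back to the default FATEObjectRanker for object ranking.
opt = ParameterOptimizer(ranker_class=None,
                         optimizer_path="optimizers/fate_object_ranker.pkl",
                         ranker_params={"n_object_features": 10},
                         learning_problem=OBJECT_RANKING)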
Example #5
def generate_concise_results_for_dataset(dataset='medoid',
                                         directory='logs_single_fold',
                                         result_directory='results'):
    ranker_names = list(ranker_options.keys())
    ranker_names.sort()
    metric_names.sort()
    data = []
    data.append(['**************', dataset.upper(), '**************', ""])
    for ranker_name in ranker_names:
        log_path = os.path.join(DIR_NAME, directory,
                                '{}_{}.log'.format(dataset, ranker_name))
        try:
            with open(log_path) as log_file:
                lines = np.array([line.rstrip('\n') for line in log_file])
        except FileNotFoundError:
            logger.error('File {} was not found'.format(log_path))
            data.append(['NE'] * len(metric_names))
            continue
        one_row = []
        for out in metric_names:
            try:
                # First log line that mentions this metric; raises IndexError
                # if the metric was never evaluated.
                matching = [s for s in lines if out in s][0]
                one_row.append(matching.split(out + ' : ')[-1])
            except IndexError:
                logger.error(
                    'Metric {} is not evaluated for ranker {} on dataset {}'.format(
                        out, ranker_name, dataset))
                one_row.append('NE')
        data.append(one_row)
    columns = [name.upper() for name in metric_names]
    indexes = [name.upper() for name in ranker_names]
    indexes.insert(0, 'DATASET')
    dataFrame = pd.DataFrame(data, index=indexes, columns=columns)
    file_path = os.path.join(DIR_NAME, result_directory,
                             '{}.csv'.format(dataset))
    create_dir_recursively(file_path, True)
    dataFrame.to_csv(file_path)
    return dataFrame
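
A hypothetical call, assuming the per-ranker log files exist under logs_single_fold:

df = generate_concise_results_for_dataset(dataset='medoid',
                                          directory='logs_single_fold',
                                          result_directory='results')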
Example #6
    file_name_format = '{}_{}'

    if cindex <= 0:
        folder = "{}_single_fold"
        result_folder = "single_cv_results"
    else:
        folder = "{}_multiple_folds"
        result_folder = "multiple_cv_results"
        # Assign the result: '{}_{}' becomes '{}_{}_<cindex>'.
        file_name_format = file_name_format.format(file_name_format, cindex)

    log_path = os.path.join(
        DIR_PATH, folder.format('logs'),
        file_name_format.format(dataset_str, ranker_name) + '.log')

    random_state = np.random.RandomState(seed=seed)
    create_dir_recursively(log_path, True)
    # log_path = rename_file_if_exist(log_path)
    logger = configure_logging_numpy_keras(seed=random_state.randint(2**32),
                                           log_path=log_path)
    logger.debug(arguments)
    dataset_function_params['random_state'] = random_state
    ranker, dataset_reader = get_ranker_and_dataset_functions(
        ranker_name, dataset_name, dataset_function_params, problem)

    X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
    n_features, n_objects = log_test_train_data(X_train, X_test, logger)
    ranker_params, fit_params, parameter_ranges = get_ranker_parameters(
        ranker_name, n_features, n_objects, dataset_name,
        dataset_function_params)
Example #7
    def __init__(self,
                 ranker,
                 optimizer_path,
                 tunable_parameter_ranges,
                 fit_params=None,
                 random_state=None,
                 tuning_callbacks=None,
                 validation_loss=None,
                 learning_problem=OBJECT_RANKING,
                 **kwd):
        """

        Parameters
        ----------
        ranker : object
            The ranker object whose hyperparameters need to be optimized.
        optimizer_path : string
            The path where the complete optimizer pickle object can be stored.
        tunable_parameter_ranges : dict
            Dictionary whose keys are the objects to be tuned. Each value is a dictionary mapping tunable parameters to the ranges in which they should be tuned.
        fit_params : dict
            The fitting parameters for the ranker
        random_state : int or object
            Numpy random state
        tuning_callbacks : list of object
            Callbacks for the parameter optimizer
        validation_loss : function
            Differentiable loss function for the ranker
        learning_problem : string
            The learning problem to which the ranker belongs
        **kwd :
            Keyword arguments for the hidden units
        """
        self.logger = logging.getLogger(PARAMETER_OPTIMIZER)

        create_dir_recursively(optimizer_path, True)
        self.optimizer_path = optimizer_path

        self._tunable_parameter_ranges = tunable_parameter_ranges

        check_ranker_class(ranker)
        self.ranker = ranker

        if tuning_callbacks is None:
            self.tuning_callbacks = []
        else:
            self.tuning_callbacks = tuning_callbacks
        default_validation_loss = {
            OBJECT_RANKING: zero_one_rank_loss,
            LABEL_RANKING: zero_one_rank_loss,
            DISCRETE_CHOICE: categorical_hinge,
            DYAD_RANKING: zero_one_rank_loss
        }
        if validation_loss is None:
            self.validation_loss = default_validation_loss[learning_problem]
            self.logger.info('Loss function is not specified, using {}'.format(
                default_validation_loss[learning_problem].__name__))
        else:
            self.validation_loss = validation_loss

        if fit_params is None:
            self._fit_params = {}
            self.logger.warning(
                "Fit params are empty; the default parameters will be applied")
        else:
            self._fit_params = fit_params

        self.random_state = check_random_state(random_state)
        self.model = None
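
A sketch of an instantiation matching the docstring above (HyperParameterTuner is a hypothetical name for the enclosing class; the range is illustrative):

# Hypothetical: tune one parameter of an existing ranker object.
tuner = HyperParameterTuner(
    ranker=ranker,
    optimizer_path="optimizers/tuner.pkl",
    tunable_parameter_ranges={ranker: {"learning_rate": (1e-4, 1e-2)}},
    learning_problem=OBJECT_RANKING)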
Example #8
            learner_params = job_description["learner_params"]
            duration = job_description["duration"]
            hp_iters = int(job_description["hp_iters"])
            hp_ranges = job_description["hp_ranges"]
            hp_fit_params = job_description["hp_fit_params"]
            learning_problem = job_description["learning_problem"]
            experiment_schema = job_description["experiment_schema"]
            experiment_table = job_description["experiment_table"]
            validation_loss = job_description["validation_loss"]
            hash_value = job_description["hash_value"]
            random_state = np.random.RandomState(seed=seed + fold_id)
            log_path = os.path.join(DIR_PATH, LOGS_FOLDER,
                                    "{}.log".format(hash_value))
            optimizer_path = os.path.join(DIR_PATH, OPTIMIZER_FOLDER,
                                          "{}".format(hash_value))
            create_dir_recursively(log_path, True)
            create_dir_recursively(optimizer_path, True)
            setup_logging(log_path=log_path)
            configure_numpy_keras(seed=seed)
            logger = logging.getLogger('Experiment')
            logger.info("DB config filePath {}".format(config_file_path))
            logger.info("Arguments {}".format(arguments))
            logger.info("Job Description {}".format(
                print_dictionary(job_description)))
            duration = get_duration_seconds(duration)

            dataset_params['random_state'] = random_state
            dataset_params['fold_id'] = fold_id
            dataset_reader = get_dataset_reader(dataset_name, dataset_params)
            X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
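
A minimal job_description sketch containing just the keys this fragment reads (all values are illustrative):

job_description = {
    "learner_params": {}, "duration": "10M", "hp_iters": "20",
    "hp_ranges": {}, "hp_fit_params": {}, "learning_problem": "object_ranking",
    "experiment_schema": "master", "experiment_table": "results",
    "validation_loss": "zero_one_rank_loss", "hash_value": "abc123",
}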
Example #9
    columns.append(ZERO_ONE_RANK_ACCURACY)
    for ranker_name in rankers_dict[OBJECT_RANKING].keys():
        one_row = ["{}".format(ranker_name.upper())]
        df_path = os.path.join(DIR_PATH, "multiple_cv_results",
                               '{}_{}.csv'.format(dataset_name, ranker_name))
        if os.path.isfile(df_path):
            df = pd.read_csv(df_path)
            df[ZERO_ONE_RANK_ACCURACY] = 1.0 - df["ZeroOneRankLossTies"]
            std = df.std(axis=0).values
            mean = df.mean(axis=0).values
            one_row.extend(["{:.3f}+-{:.3f}".format(m, s) for m, s in zip(mean, std)])
            data.append(one_row)
    if len(data) > 0:
        df_path = os.path.join(DIR_PATH, "multiple_cv_results",
                               '{}_{}.csv'.format(dataset_name, "dataset"))
        create_dir_recursively(df_path, is_file_path=True)
        dataFrame = pd.DataFrame(data, columns=columns)
        dataFrame.to_csv(df_path)
        del dataFrame["KendallsTau"]
        del dataFrame["ZeroOneRankLossTies"]
        del dataFrame["ZeroOneRankLoss"]
        print(dataFrame.to_latex())
        del dataFrame
df_paths = os.path.join(DIR_PATH, "single_cv_results", '*.csv')
for file_path in glob.glob(df_paths):
    print("Dataset {}".format(file_path.split('/')[-1]))
    dataFrame = pd.read_csv(file_path)
    dataFrame = dataFrame.round(3)
    del dataFrame["KendallsTau"]
    del dataFrame["ZeroOneRankLoss"]
    if "ZeroOneAccuracy" in dataFrame:
Example #10
    ranker_name = arguments['--ranker_name']
    problem = arguments['--problem']
    duration = arguments['--duration']

    duration = get_duration_microsecond(duration)

    dataset_name, ranker_name = get_applicable_ranker_dataset(
        dataset_name=dataset_name, ranker_name=ranker_name, problem=problem)

    dataset_function_params, dataset_str = get_dataset_str(
        dataset_function_params, dataset_name)

    log_path = os.path.join(DIR_PATH, LOGS_FOLDER,
                            (FILE_FORMAT + '.log').format(
                                dataset_str, ranker_name))
    create_dir_recursively(log_path, True)
    # log_path = rename_file_if_exist(log_path)
    random_state = np.random.RandomState(seed=seed)

    logger = configure_logging_numpy_keras(seed=random_state.randint(2**32),
                                           log_path=log_path)
    logger.debug(arguments)
    logger.debug("The duration {}".format(microsec_to_time(duration)))
    logger.debug("Started the experiment at {}".format(start))

    dataset_function_params["random_state"] = random_state
    ranker, dataset_reader = get_ranker_and_dataset_functions(
        ranker_name, dataset_name, dataset_function_params, problem)

    X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()