def default_setting(self): """ A default setting for data loading :return: """ unknown_as_zero = False # using original labels, e.g., w.r.t. semi-supervised dataset binary_rele = False # using original labels train_presort, validation_presort, test_presort = True, True, True train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1 scale_data, scaler_id, scaler_level = get_default_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size, test_batch_size=test_batch_size, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading when performing adversarial ltr """ unknown_as_zero = False binary_rele = False # using the original values train_presort, validation_presort, test_presort = True, True, True train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100 scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict( data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_rough_batch_size=train_rough_batch_size, validation_rough_batch_size=validation_rough_batch_size, test_rough_batch_size=test_rough_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information if self.debug: data_meta['fold_num'] = 2 self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading when running lambdaMART """ unknown_as_zero = True if self.data_id in MSLETOR_SEMI else False # since lambdaMART is a supervised method binary_rele = False # using the original values train_presort, validation_presort, test_presort = False, False, False train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1 scale_data, scaler_id, scaler_level = get_default_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size, test_batch_size=test_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading when running lambdaMART :return: """ unknown_as_zero = True if self.data_id in MSLETOR_SEMI else False # since lambdaMART is a supervised method binary_rele = False # using the original values presort = True # this setting leads to no difference for lambdaMART, but it can be altered to reused buffered data scale_data, scaler_id, scaler_level = get_default_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, sample_rankings_per_q=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, presort=presort, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading :return: """ unknown_as_zero = True if self.data_id in MSLETOR_SEMI else False binary_rele = False # using the original values presort = True # a default setting scale_data, scaler_id, scaler_level = get_default_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, sample_rankings_per_q=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, presort=presort, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def grid_search(self): """ Iterator of settings for data loading when performing adversarial ltr :param debug: :param data_id: :param dir_data: :return: """ ''' common settings without grid-search ''' binary_rele, unknown_as_zero = False, False common_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) data_meta = get_data_meta(data_id=self.data_id) # add meta-information common_data_dict.update(data_meta) ''' some settings for grid-search ''' choice_presort = [True] if self.debug else [True] choice_sample_rankings_per_q = [1] if self.debug else [1] # number of sample rankings per query choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting(data_id=self.data_id, grid_search=True) for scale_data, scaler_id, scaler_level, presort, sample_rankings_per_q in product(choice_scale_data, choice_scaler_id, choice_scaler_level, choice_presort, choice_sample_rankings_per_q): self.data_dict = dict(presort=presort, sample_rankings_per_q=sample_rankings_per_q, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) self.data_dict.update(common_data_dict) yield self.data_dict
def grid_search(self):
    """
    Iterator of settings for data loading.
    """
    if self.use_json:
        scaler_id = self.json_dict['scaler_id']
        choice_min_docs = self.json_dict['min_docs']
        choice_min_rele = self.json_dict['min_rele']
        choice_binary_rele = self.json_dict['binary_rele']
        choice_unknown_as_zero = self.json_dict['unknown_as_zero']
        choice_tr_batch_size = self.json_dict['tr_batch_size']  # train_rough_batch_size
        # hard-coding for rarely changed settings
        base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"],
                              train_presort=True, test_presort=True, validation_presort=True,
                              validation_rough_batch_size=100, test_rough_batch_size=100)
    else:
        scaler_id = None
        choice_min_docs = [10]
        choice_min_rele = [1]
        choice_binary_rele = [False]
        choice_unknown_as_zero = [False]
        choice_tr_batch_size = [100]
        base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data,
                              train_presort=True, test_presort=True, validation_presort=True,
                              validation_rough_batch_size=100, test_rough_batch_size=100)

    data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
    if self.debug:
        data_meta['fold_num'] = 1
    base_data_dict.update(data_meta)

    scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id)

    for min_docs, min_rele, tr_batch_size in product(choice_min_docs, choice_min_rele,
                                                     choice_tr_batch_size):
        threshold_dict = dict(min_docs=min_docs, min_rele=min_rele,
                              train_rough_batch_size=tr_batch_size)
        for binary_rele, unknown_as_zero in product(choice_binary_rele, choice_unknown_as_zero):
            custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero)
            scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level)

            self.data_dict = dict()
            self.data_dict.update(base_data_dict)
            self.data_dict.update(threshold_dict)
            self.data_dict.update(custom_dict)
            self.data_dict.update(scale_dict)
            yield self.data_dict

def grid_search(self): """ Iterator of settings for data loading when performing adversarial ltr """ if self.use_json: scaler_id = self.json_dict['scaler_id'] choice_min_docs = self.json_dict['min_docs'] choice_min_rele = self.json_dict['min_rele'] choice_binary_rele = self.json_dict['binary_rele'] choice_unknown_as_zero = self.json_dict['unknown_as_zero'] base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], train_presort=True, test_presort=True, validation_presort=True, train_rough_batch_size=1, validation_rough_batch_size=100, test_rough_batch_size=100) else: scaler_id = None choice_min_docs = [10] choice_min_rele = [1] choice_binary_rele = [False] choice_unknown_as_zero = [False] base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, train_presort=True, test_presort=True, validation_presort=True, train_rough_batch_size=1, validation_rough_batch_size=100, test_rough_batch_size=100) data_meta = get_data_meta(data_id=self.data_id) # add meta-information base_data_dict.update(data_meta) scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id, scaler_id=scaler_id) for min_docs, min_rele in product(choice_min_docs, choice_min_rele): threshold_dict = dict(min_docs=min_docs, min_rele=min_rele) for binary_rele, unknown_as_zero in product( choice_binary_rele, choice_unknown_as_zero): custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero) scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) self.data_dict = dict() self.data_dict.update(base_data_dict) self.data_dict.update(threshold_dict) self.data_dict.update(custom_dict) self.data_dict.update(scale_dict) yield self.data_dict
def grid_search(self):
    """
    Iterator of settings for data loading.
    """
    if self.data_json is not None:  # using json file
        choice_presort = self.json_dict['presort']
        choice_min_docs = self.json_dict['min_docs']
        choice_min_rele = self.json_dict['min_rele']
        choice_binary_rele = self.json_dict['binary_rele']
        choice_unknown_as_zero = self.json_dict['unknown_as_zero']
        choice_sample_rankings_per_q = self.json_dict['sample_rankings_per_q']
        base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"])
    else:
        choice_min_docs = [10]
        choice_min_rele = [1]
        choice_presort = [True]
        choice_binary_rele = [False]
        choice_unknown_as_zero = [True]
        choice_sample_rankings_per_q = [1]  # number of sample rankings per query
        base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data)

    data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
    base_data_dict.update(data_meta)

    choice_scale_data, choice_scaler_id, choice_scaler_level = \
        get_default_scaler_setting(data_id=self.data_id, grid_search=True)

    for min_docs, min_rele, sample_rankings_per_q in product(choice_min_docs, choice_min_rele,
                                                             choice_sample_rankings_per_q):
        threshold_dict = dict(min_docs=min_docs, min_rele=min_rele,
                              sample_rankings_per_q=sample_rankings_per_q)
        for binary_rele, unknown_as_zero, presort in product(choice_binary_rele,
                                                             choice_unknown_as_zero, choice_presort):
            custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
                               presort=presort)
            for scale_data, scaler_id, scaler_level in product(choice_scale_data, choice_scaler_id,
                                                               choice_scaler_level):
                scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict

def set_model_setting(self, model_id=None, data_id=None, dir_json=None, debug=False):
    """
    Initialize the parameter class for a specified model.
    :param model_id:
    :param data_id:
    :param dir_json:
    :param debug:
    :return:
    """
    if model_id in ['RankMSE', 'RankNet', 'ListNet', 'ListMLE', 'RankCosine']:
        # the 1st type with model_id, where ModelParameter is sufficient
        self.model_parameter = ModelParameter(model_id=model_id)
    elif model_id in ['LambdaRank', 'ApproxNDCG', 'DirectOpt', 'MarginLambdaLoss']:
        # the 2nd type, where the information of the type of relevance label is required
        data_meta = get_data_meta(data_id=data_id)  # add meta-information
        if data_meta['multi_level_rele']:
            if dir_json is not None:
                para_json = dir_json + model_id + "Parameter.json"
                self.model_parameter = globals()[model_id + "Parameter"](
                    para_json=para_json, std_rele_is_permutation=False)
            else:
                self.model_parameter = globals()[model_id + "Parameter"](
                    debug=debug, std_rele_is_permutation=False)
        else:  # the case like MSLETOR_LIST
            if dir_json is not None:
                para_json = dir_json + model_id + "Parameter.json"
                self.model_parameter = globals()[model_id + "Parameter"](
                    para_json=para_json, std_rele_is_permutation=True)
            else:
                self.model_parameter = globals()[model_id + "Parameter"](
                    debug=debug, std_rele_is_permutation=True)
    else:  # the 3rd type, where debug-mode enables a quick test
        if dir_json is not None:
            para_json = dir_json + model_id + "Parameter.json"
            self.model_parameter = globals()[model_id + "Parameter"](para_json=para_json)
        else:
            self.model_parameter = globals()[model_id + "Parameter"](debug=debug)

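# Usage sketch (hypothetical): the dynamic lookup `globals()[model_id + "Parameter"]`
# relies on a naming convention -- a class named, e.g., `LambdaRankParameter` must be
# importable in this module's namespace. The `evaluator` object and the
# `default_para_dict()` call below are assumptions for illustration:
#
#     evaluator.set_model_setting(model_id='LambdaRank', data_id='MQ2008_Super')
#     param_dict = evaluator.model_parameter.default_para_dict()  # assumed ModelParameter API
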
def load_setting(self):
    """
    Load a single data-setting dict. Note: the nested loops below return on their
    first iteration, i.e., the first value of each choice list is used.
    """
    if self.use_json:
        choice_min_docs = self.json_dict['min_docs']
        choice_min_rele = self.json_dict['min_rele']
        choice_binary_rele = self.json_dict['binary_rele']
        choice_unknown_as_zero = self.json_dict['unknown_as_zero']
        choice_train_presort = self.json_dict['train_presort']
        choice_train_batch_size = self.json_dict['train_batch_size']
        # hard-coding for rarely changed settings
        base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"],
                              test_presort=True, validation_presort=True,
                              validation_batch_size=1, test_batch_size=1)
    else:
        choice_min_docs = [10]
        choice_min_rele = [1]
        choice_binary_rele = [False]
        choice_unknown_as_zero = [False]
        choice_train_presort = [True]
        choice_train_batch_size = [1]  # batch size, i.e., number of queries per batch
        base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data,
                              test_presort=True, validation_presort=True,
                              validation_batch_size=1, test_batch_size=1)

    data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
    base_data_dict.update(data_meta)

    choice_scale_data, choice_scaler_id, choice_scaler_level = \
        get_default_scaler_setting(data_id=self.data_id, grid_search=True)

    for min_docs, min_rele, train_batch_size in product(choice_min_docs, choice_min_rele,
                                                        choice_train_batch_size):
        threshold_dict = dict(min_docs=min_docs, min_rele=min_rele,
                              train_batch_size=train_batch_size)
        for binary_rele, unknown_as_zero, train_presort in product(choice_binary_rele,
                                                                   choice_unknown_as_zero,
                                                                   choice_train_presort):
            custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
                               train_presort=train_presort)
            for scale_data, scaler_id, scaler_level in product(choice_scale_data, choice_scaler_id,
                                                               choice_scaler_level):
                scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                return self.data_dict  # return (not yield): only the first combination is used

def check_dataset_statistics(data_id, dir_data, buffer=False):
    ''' Get the basic statistics on the specified dataset '''
    if data_id in YAHOO_LTR:
        data_prefix = dir_data + data_id.lower() + '.'
        file_train, file_vali, file_test = data_prefix + 'train.txt', data_prefix + 'valid.txt', \
                                           data_prefix + 'test.txt'
    elif data_id in ISTELLA_LTR:
        data_prefix = dir_data + data_id + '/'
        if data_id in ('Istella_X', 'Istella_S'):
            file_train, file_vali, file_test = data_prefix + 'train.txt', data_prefix + 'vali.txt', \
                                               data_prefix + 'test.txt'
        else:  # the plain Istella dataset provides no validation split
            file_train, file_test = data_prefix + 'train.txt', data_prefix + 'test.txt'
    else:
        fold_k = 1
        fold_k_dir = dir_data + 'Fold' + str(fold_k) + '/'
        file_train, file_vali, file_test = fold_k_dir + 'train.txt', fold_k_dir + 'vali.txt', \
                                           fold_k_dir + 'test.txt'

    # common
    if data_id == 'Istella':  # no validation split
        train_dataset = LTRDataset(split_type=SPLIT_TYPE.Train, file=file_train, data_id=data_id,
                                   shuffle=False, buffer=buffer)
        test_dataset = LTRDataset(split_type=SPLIT_TYPE.Test, file=file_test, data_id=data_id,
                                  shuffle=False, buffer=buffer)

        num_queries = len(train_dataset) + len(test_dataset)
        print('Dataset:\t', data_id)
        print('Total queries:\t', num_queries)
        print('\tTrain:', len(train_dataset), 'Test:', len(test_dataset))

        num_docs = get_doc_num(train_dataset) + get_doc_num(test_dataset)
        print('Total docs:\t', num_docs)

        min_doc, max_doc, sum_rele = get_min_max_docs(train_dataset=train_dataset,
                                                      vali_dataset=None, test_dataset=test_dataset)
        data_meta = get_data_meta(data_id=data_id)
        max_rele_label = data_meta['max_rele_level']
        # the keyword name 'max_lavel' is kept as-is to match the callee's signature
        sum_bin_cnts = get_label_distribution(train_dataset=train_dataset, test_dataset=test_dataset,
                                              semi_supervised=False, max_lavel=max_rele_label)
    else:
        train_dataset = LTRDataset(split_type=SPLIT_TYPE.Train, file=file_train, data_id=data_id,
                                   shuffle=False, buffer=buffer)
        vali_dataset = LTRDataset(split_type=SPLIT_TYPE.Validation, file=file_vali, data_id=data_id,
                                  shuffle=False, buffer=buffer)
        test_dataset = LTRDataset(split_type=SPLIT_TYPE.Test, file=file_test, data_id=data_id,
                                  shuffle=False, buffer=buffer)

        num_queries = len(train_dataset) + len(vali_dataset) + len(test_dataset)
        print('Dataset:\t', data_id)
        print('Total queries:\t', num_queries)
        print('\tTrain:', len(train_dataset), 'Vali:', len(vali_dataset), 'Test:', len(test_dataset))

        num_docs = get_doc_num(train_dataset) + get_doc_num(vali_dataset) + get_doc_num(test_dataset)
        print('Total docs:\t', num_docs)

        if data_id in MSLETOR_SEMI:
            min_doc, max_doc, sum_rele, sum_unknown = get_min_max_docs(
                train_dataset=train_dataset, vali_dataset=vali_dataset, test_dataset=test_dataset,
                semi_supervised=True)
        else:
            min_doc, max_doc, sum_rele = get_min_max_docs(
                train_dataset=train_dataset, vali_dataset=vali_dataset, test_dataset=test_dataset)

        data_meta = get_data_meta(data_id=data_id)
        max_rele_label = data_meta['max_rele_level']
        sum_bin_cnts = get_label_distribution(train_dataset=train_dataset, vali_dataset=vali_dataset,
                                              test_dataset=test_dataset, semi_supervised=False,
                                              max_lavel=max_rele_label)

    print('min, max documents per query', min_doc, max_doc)
    print('total relevant documents', sum_rele)
    print('avg rele documents per query', sum_rele * 1.0 / num_queries)
    print('avg documents per query', num_docs * 1.0 / num_queries)
    print('label distribution: ', sum_bin_cnts)

    if data_id in MSLETOR_SEMI:
        print('total unlabeled documents', sum_unknown)

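# Usage sketch (the dataset id and directory layout are assumptions; adjust to your
# local paths):
#
#     check_dataset_statistics(data_id='MQ2008_Super', dir_data='/data/MQ2008/')
#     # prints query/document counts, per-query min/max, and the label distribution
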
def grid_search(self): """ Iterator of settings for data loading when performing adversarial ltr """ if self.ad_data_json is not None: # using json file choice_min_docs = self.json_dict['min_docs'] choice_min_rele = self.json_dict['min_rele'] choice_binary_rele = self.json_dict['binary_rele'] choice_unknown_as_zero = self.json_dict['unknown_as_zero'] choice_train_presort = self.json_dict['train_presort'] choice_train_batch_size = self.json_dict['train_batch_size'] base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], test_presort=True, validation_presort=True, validation_batch_size=1, test_batch_size=1) else: choice_min_docs = [10] choice_min_rele = [1] choice_binary_rele = [False] choice_unknown_as_zero = [False] choice_train_presort = [True] choice_train_batch_size = [1 ] # number of sample rankings per query base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, test_presort=True, validation_presort=True, validation_batch_size=1, test_batch_size=1) data_meta = get_data_meta(data_id=self.data_id) # add meta-information base_data_dict.update(data_meta) choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting( data_id=self.data_id, grid_search=True) for min_docs, min_rele, train_batch_size in product( choice_min_docs, choice_min_rele, choice_train_batch_size): threshold_dict = dict(min_docs=min_docs, min_rele=min_rele, train_batch_size=train_batch_size) for binary_rele, unknown_as_zero, train_presort in product( choice_binary_rele, choice_unknown_as_zero, choice_train_presort): custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero, train_presort=train_presort) for scale_data, scaler_id, scaler_level in product( choice_scale_data, choice_scaler_id, choice_scaler_level): scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) self.data_dict = dict() self.data_dict.update(base_data_dict) self.data_dict.update(threshold_dict) self.data_dict.update(custom_dict) self.data_dict.update(scale_dict) yield self.data_dict
def default_setting(self): """ A default setting for data loading :return: """ if self.use_json: scaler_id = self.json_dict['scaler_id'] min_docs = self.json_dict['min_docs'][0] min_rele = self.json_dict['min_rele'][0] binary_rele = self.json_dict['binary_rele'][0] unknown_as_zero = self.json_dict['unknown_as_zero'][0] tr_batch_size = self.json_dict['tr_batch_size'][ 0] # train_rough_batch_size scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id, scaler_id=scaler_id) # hard-coding for rarely changed settings self.data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], train_presort=True, test_presort=True, validation_presort=True, validation_rough_batch_size=100, test_rough_batch_size=100, min_docs=min_docs, min_rele=min_rele, train_rough_batch_size=tr_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) else: unknown_as_zero = False # using original labels, e.g., w.r.t. semi-supervised dataset binary_rele = False # using original labels train_presort, validation_presort, test_presort = True, True, True #train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100 train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 100, 100, 100 scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict( data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_rough_batch_size=train_rough_batch_size, validation_rough_batch_size=validation_rough_batch_size, test_rough_batch_size=test_rough_batch_size, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) data_meta = get_data_meta(data_id=self.data_id) # add meta-information if self.debug: data_meta['fold_num'] = 2 self.data_dict.update(data_meta) return self.data_dict