def default_setting(self): """ A default setting for data loading when performing adversarial ltr """ unknown_as_zero = False binary_rele = False # using the original values train_presort, validation_presort, test_presort = True, True, True train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100 scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict( data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_rough_batch_size=train_rough_batch_size, validation_rough_batch_size=validation_rough_batch_size, test_rough_batch_size=test_rough_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information if self.debug: data_meta['fold_num'] = 2 self.data_dict.update(data_meta) return self.data_dict
def grid_search(self):
    """
    Iterator of settings for data loading
    """
    if self.use_json:
        scaler_id = self.json_dict['scaler_id']
        choice_min_docs = self.json_dict['min_docs']
        choice_min_rele = self.json_dict['min_rele']
        choice_binary_rele = self.json_dict['binary_rele']
        choice_unknown_as_zero = self.json_dict['unknown_as_zero']
        choice_tr_batch_size = self.json_dict['tr_batch_size']  # train_rough_batch_size
        # hard-coding for rarely changed settings
        base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"],
                              train_presort=True, test_presort=True, validation_presort=True,
                              validation_rough_batch_size=100, test_rough_batch_size=100)
    else:
        scaler_id = None
        choice_min_docs = [10]
        choice_min_rele = [1]
        choice_binary_rele = [False]
        choice_unknown_as_zero = [False]
        choice_tr_batch_size = [100]
        base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data,
                              train_presort=True, test_presort=True, validation_presort=True,
                              validation_rough_batch_size=100, test_rough_batch_size=100)

    data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
    if self.debug:
        data_meta['fold_num'] = 1
    base_data_dict.update(data_meta)

    scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id)

    for min_docs, min_rele, tr_batch_size in product(choice_min_docs, choice_min_rele,
                                                     choice_tr_batch_size):
        threshold_dict = dict(min_docs=min_docs, min_rele=min_rele,
                              train_rough_batch_size=tr_batch_size)
        for binary_rele, unknown_as_zero in product(choice_binary_rele, choice_unknown_as_zero):
            custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero)
            scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level)

            self.data_dict = dict()
            self.data_dict.update(base_data_dict)
            self.data_dict.update(threshold_dict)
            self.data_dict.update(custom_dict)
            self.data_dict.update(scale_dict)
            yield self.data_dict
def grid_search(self): """ Iterator of settings for data loading when performing adversarial ltr """ if self.use_json: scaler_id = self.json_dict['scaler_id'] choice_min_docs = self.json_dict['min_docs'] choice_min_rele = self.json_dict['min_rele'] choice_binary_rele = self.json_dict['binary_rele'] choice_unknown_as_zero = self.json_dict['unknown_as_zero'] base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], train_presort=True, test_presort=True, validation_presort=True, train_rough_batch_size=1, validation_rough_batch_size=100, test_rough_batch_size=100) else: scaler_id = None choice_min_docs = [10] choice_min_rele = [1] choice_binary_rele = [False] choice_unknown_as_zero = [False] base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, train_presort=True, test_presort=True, validation_presort=True, train_rough_batch_size=1, validation_rough_batch_size=100, test_rough_batch_size=100) data_meta = get_data_meta(data_id=self.data_id) # add meta-information base_data_dict.update(data_meta) scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id, scaler_id=scaler_id) for min_docs, min_rele in product(choice_min_docs, choice_min_rele): threshold_dict = dict(min_docs=min_docs, min_rele=min_rele) for binary_rele, unknown_as_zero in product( choice_binary_rele, choice_unknown_as_zero): custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero) scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) self.data_dict = dict() self.data_dict.update(base_data_dict) self.data_dict.update(threshold_dict) self.data_dict.update(custom_dict) self.data_dict.update(scale_dict) yield self.data_dict
def grid_search(self):
    """
    Iterator of settings for data loading
    """
    if self.use_json:
        scaler_id = self.json_dict['scaler_id']
        choice_min_docs = self.json_dict['min_docs']
        choice_min_rele = self.json_dict['min_rele']
        choice_binary_rele = self.json_dict['binary_rele']
        choice_unknown_as_zero = self.json_dict['unknown_as_zero']
        choice_train_presort = self.json_dict['train_presort']
        choice_train_batch_size = self.json_dict['train_batch_size']
        # hard-coding for rarely changed settings
        base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"],
                              test_presort=True, validation_presort=True,
                              validation_batch_size=1, test_batch_size=1)
    else:
        scaler_id = None
        choice_min_docs = [10]
        choice_min_rele = [1]
        choice_binary_rele = [False]
        choice_unknown_as_zero = [False]
        choice_train_presort = [True]
        choice_train_batch_size = [1]  # number of sample rankings per query
        base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data,
                              test_presort=True, validation_presort=True,
                              validation_batch_size=1, test_batch_size=1)

    data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
    base_data_dict.update(data_meta)

    choice_scale_data, choice_scaler_id, choice_scaler_level = \
        get_scaler_setting(data_id=self.data_id, grid_search=True, scaler_id=scaler_id)

    for min_docs, min_rele, train_batch_size in product(choice_min_docs, choice_min_rele,
                                                        choice_train_batch_size):
        threshold_dict = dict(min_docs=min_docs, min_rele=min_rele,
                              train_batch_size=train_batch_size)
        for binary_rele, unknown_as_zero, train_presort in product(choice_binary_rele,
                                                                   choice_unknown_as_zero,
                                                                   choice_train_presort):
            custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
                               train_presort=train_presort)
            for scale_data, _scaler_id, scaler_level in product(choice_scale_data,
                                                                choice_scaler_id,
                                                                choice_scaler_level):
                scale_dict = dict(scale_data=scale_data, scaler_id=_scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict
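# For orientation: the three nested product() loops above enumerate the full
# cross-product of all nine choice lists, so the number of yielded settings is
# the product of their lengths. A self-contained illustration with made-up
# choice lists:
#
#   from itertools import product
#   grid = list(product([10], [1], [True, False]))   # 1 * 1 * 2 combinations
#   assert len(grid) == 2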
def default_setting(self): """ A default setting for data loading :return: """ scaler_id = None unknown_as_zero = False # using original labels, e.g., w.r.t. semi-supervised dataset binary_rele = False # using original labels train_presort, validation_presort, test_presort = True, True, True train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1 scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, scale_data = scale_data, scaler_id = scaler_id, scaler_level = scaler_level, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size, test_batch_size=test_batch_size, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading when running lambdaMART """ scaler_id = None unknown_as_zero = True if self.data_id in MSLETOR_SEMI else False # since lambdaMART is a supervised method binary_rele = False # using the original values train_presort, validation_presort, test_presort = False, False, False train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1 scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id) # more data settings that are rarely changed self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size, test_batch_size=test_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level) data_meta = get_data_meta(data_id=self.data_id) # add meta-information self.data_dict.update(data_meta) return self.data_dict
def default_setting(self): """ A default setting for data loading :return: """ if self.use_json: scaler_id = self.json_dict['scaler_id'] min_docs = self.json_dict['min_docs'][0] min_rele = self.json_dict['min_rele'][0] binary_rele = self.json_dict['binary_rele'][0] unknown_as_zero = self.json_dict['unknown_as_zero'][0] tr_batch_size = self.json_dict['tr_batch_size'][ 0] # train_rough_batch_size scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id, scaler_id=scaler_id) # hard-coding for rarely changed settings self.data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], train_presort=True, test_presort=True, validation_presort=True, validation_rough_batch_size=100, test_rough_batch_size=100, min_docs=min_docs, min_rele=min_rele, train_rough_batch_size=tr_batch_size, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) else: unknown_as_zero = False # using original labels, e.g., w.r.t. semi-supervised dataset binary_rele = False # using original labels train_presort, validation_presort, test_presort = True, True, True #train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100 train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 100, 100, 100 scale_data, scaler_id, scaler_level = get_scaler_setting( data_id=self.data_id) # more data settings that are rarely changed self.data_dict = dict( data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1, scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level, train_presort=train_presort, validation_presort=validation_presort, test_presort=test_presort, train_rough_batch_size=train_rough_batch_size, validation_rough_batch_size=validation_rough_batch_size, test_rough_batch_size=test_rough_batch_size, unknown_as_zero=unknown_as_zero, binary_rele=binary_rele) data_meta = get_data_meta(data_id=self.data_id) # add meta-information if self.debug: data_meta['fold_num'] = 2 self.data_dict.update(data_meta) return self.data_dict