Example #1
    def default_setting(self):
        """
		A default setting for data loading when performing adversarial ltr
		"""
        unknown_as_zero = False
        binary_rele = False  # using the original values
        train_presort, validation_presort, test_presort = True, True, True
        train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100
        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(
            data_id=self.data_id,
            dir_data=self.dir_data,
            min_docs=10,
            min_rele=1,
            unknown_as_zero=unknown_as_zero,
            binary_rele=binary_rele,
            train_presort=train_presort,
            validation_presort=validation_presort,
            test_presort=test_presort,
            train_rough_batch_size=train_rough_batch_size,
            validation_rough_batch_size=validation_rough_batch_size,
            test_rough_batch_size=test_rough_batch_size,
            scale_data=scale_data,
            scaler_id=scaler_id,
            scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        if self.debug: data_meta['fold_num'] = 2
        self.data_dict.update(data_meta)

        return self.data_dict
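
The pattern here recurs in every example below: assemble a base dict of defaults, then update() it with dataset meta-information so the meta keys win on overlap. A minimal, self-contained sketch of that flow, where fake_data_meta and the toy data_id are stand-ins for get_data_meta and a real dataset identifier:

def fake_data_meta(data_id):
    # Stand-in for get_data_meta(); real meta-information comes from the library.
    return {'fold_num': 5, 'has_comment': False}

def build_data_dict(data_id, dir_data, debug=False):
    data_dict = dict(data_id=data_id, dir_data=dir_data, min_docs=10, min_rele=1)
    data_meta = fake_data_meta(data_id)
    if debug:
        data_meta['fold_num'] = 2  # fewer folds for quick debug runs, as above
    data_dict.update(data_meta)   # meta keys override any overlapping defaults
    return data_dict

print(build_data_dict('toy_data_id', '/path/to/data/', debug=True)['fold_num'])  # 2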
Example #2
    def grid_search(self):
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            choice_tr_batch_size = self.json_dict['tr_batch_size']  # i.e., train_rough_batch_size
            # hard-coding for rarely changed settings
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)
        else:
            scaler_id = None
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            choice_tr_batch_size = [100]
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.dir_data,
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        if self.debug: data_meta['fold_num'] = 1
        base_data_dict.update(data_meta)

        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id, scaler_id=scaler_id)

        for min_docs, min_rele, tr_batch_size in product(
                choice_min_docs, choice_min_rele, choice_tr_batch_size):
            threshold_dict = dict(min_docs=min_docs,
                                  min_rele=min_rele,
                                  train_rough_batch_size=tr_batch_size)

            for binary_rele, unknown_as_zero in product(
                    choice_binary_rele, choice_unknown_as_zero):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero)
                scale_dict = dict(scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict
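
grid_search leans on itertools.product to visit every combination of the choice lists exactly once, with the last list varying fastest. A self-contained illustration with toy choices:

from itertools import product

choice_min_docs = [10, 50]
choice_min_rele = [1]
choice_tr_batch_size = [1, 100]
for min_docs, min_rele, tr_batch_size in product(choice_min_docs, choice_min_rele, choice_tr_batch_size):
    print(min_docs, min_rele, tr_batch_size)
# 10 1 1
# 10 1 100
# 50 1 1
# 50 1 100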
Example #3
    def grid_search(self):
        """
		Iterator of settings for data loading when performing adversarial ltr
		"""
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  train_rough_batch_size=1,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)
        else:
            scaler_id = None
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.dir_data,
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  train_rough_batch_size=1,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        base_data_dict.update(data_meta)

        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id, scaler_id=scaler_id)

        for min_docs, min_rele in product(choice_min_docs, choice_min_rele):
            threshold_dict = dict(min_docs=min_docs, min_rele=min_rele)

            for binary_rele, unknown_as_zero in product(
                    choice_binary_rele, choice_unknown_as_zero):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero)
                scale_dict = dict(scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict
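
The use_json branch expects self.json_dict to carry one list of candidate values per tunable key. The exact file layout is not shown in the snippet; the following is a hypothetical config whose keys mirror the lookups above, with illustrative values only:

import json

example_json = '''
{
    "dir_data": "/path/to/dataset/",
    "scaler_id": "StandardScaler",
    "min_docs": [10],
    "min_rele": [1],
    "binary_rele": [false],
    "unknown_as_zero": [false]
}
'''
json_dict = json.loads(example_json)
assert isinstance(json_dict['min_docs'], list)  # choices are lists so they can be grid-searched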
Example #4
    def grid_search(self):
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            choice_train_presort = self.json_dict['train_presort']
            choice_train_batch_size = self.json_dict['train_batch_size']
            # hard-coding for rarely changed settings
            base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], test_presort=True,
                                  validation_presort=True, validation_batch_size=1, test_batch_size=1)
        else:
            scaler_id = None
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            choice_train_presort = [True]
            choice_train_batch_size = [1] # number of sample rankings per query

            base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, test_presort=True,
                                  validation_presort=True, validation_batch_size=1, test_batch_size=1)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        base_data_dict.update(data_meta)

        choice_scale_data, choice_scaler_id, choice_scaler_level = \
            get_scaler_setting(data_id=self.data_id, grid_search=True, scaler_id=scaler_id)

        for min_docs, min_rele, train_batch_size in product(choice_min_docs, choice_min_rele, choice_train_batch_size):
            threshold_dict = dict(min_docs=min_docs, min_rele=min_rele, train_batch_size=train_batch_size)

            for binary_rele, unknown_as_zero, train_presort in product(choice_binary_rele, choice_unknown_as_zero, choice_train_presort):
                custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero, train_presort=train_presort)

                for scale_data, _scaler_id, scaler_level in product(choice_scale_data, choice_scaler_id, choice_scaler_level):
                    scale_dict = dict(scale_data=scale_data, scaler_id=_scaler_id, scaler_level=scaler_level)

                    self.data_dict = dict()
                    self.data_dict.update(base_data_dict)
                    self.data_dict.update(threshold_dict)
                    self.data_dict.update(custom_dict)
                    self.data_dict.update(scale_dict)
                    yield self.data_dict
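
Since this variant also grids over the scaler settings, the number of yielded configurations is the product of the lengths of all choice lists. A quick sanity check before launching a long run (the choice values here are toy stand-ins, not the library's defaults):

from math import prod

choices = dict(min_docs=[10], min_rele=[1], train_batch_size=[1],
               binary_rele=[False], unknown_as_zero=[False], train_presort=[True],
               scale_data=[True, False], scaler_id=['StandardScaler'], scaler_level=['QUERY'])
total = prod(len(v) for v in choices.values())
print(total)  # 2 configurations for this toy setup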
Example #5
    def default_setting(self):
        """
        A default setting for data loading
        :return:
        """
        scaler_id = None
        unknown_as_zero = False  # keep the original labels, e.g., for a semi-supervised dataset
        binary_rele = False  # keep the original labels
        train_presort, validation_presort, test_presort = True, True, True
        train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1
        scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1,
                              scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level,
                              train_presort=train_presort, validation_presort=validation_presort,
                              test_presort=test_presort, train_batch_size=train_batch_size,
                              validation_batch_size=validation_batch_size, test_batch_size=test_batch_size,
                              unknown_as_zero=unknown_as_zero, binary_rele=binary_rele)

        data_meta = get_data_meta(data_id=self.data_id) # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
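
All three presort flags default to True here. In this codebase, presort appears to mean that each query's documents are sorted by descending relevance label before batching, an ordering some listwise losses assume. A hedged sketch of that invariant:

# Documents of one query, reordered by descending label; features travel with labels.
docs = [('d1', 0), ('d2', 2), ('d3', 1), ('d4', 3)]
presorted = sorted(docs, key=lambda d: d[1], reverse=True)
print([d for d, _ in presorted])  # ['d4', 'd2', 'd3', 'd1']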
Example #6
    def default_setting(self):
        """
        A default setting for data loading when running lambdaMART
        """
        scaler_id = None
        unknown_as_zero = self.data_id in MSLETOR_SEMI  # since LambdaMART is a supervised method
        binary_rele = False  # using the original values
        train_presort, validation_presort, test_presort = False, False, False
        train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1

        scale_data, scaler_id, scaler_level = get_scaler_setting(data_id=self.data_id, scaler_id=scaler_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1,
                              unknown_as_zero=unknown_as_zero, binary_rele=binary_rele, train_presort=train_presort,
                              validation_presort=validation_presort, test_presort=test_presort,
                              train_batch_size=train_batch_size, validation_batch_size=validation_batch_size,
                              test_batch_size=test_batch_size, scale_data=scale_data, scaler_id=scaler_id,
                              scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
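
The MSLETOR_SEMI check is the one place these defaults diverge: semi-supervised LETOR datasets mark unlabeled documents with a relevance of -1, and a fully supervised learner such as LambdaMART needs those mapped to 0. A hedged sketch of the mapping (the real conversion happens inside the data loader):

raw_labels = [2, -1, 1, -1, 0]
labels = [max(label, 0) for label in raw_labels]  # unknown (-1) treated as non-relevant
print(labels)  # [2, 0, 1, 0, 0]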
Example #7
    def default_setting(self):
        """
        A default setting for data loading
        :return:
        """
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            min_docs = self.json_dict['min_docs'][0]
            min_rele = self.json_dict['min_rele'][0]
            binary_rele = self.json_dict['binary_rele'][0]
            unknown_as_zero = self.json_dict['unknown_as_zero'][0]
            tr_batch_size = self.json_dict['tr_batch_size'][0]  # i.e., train_rough_batch_size

            scale_data, scaler_id, scaler_level = get_scaler_setting(
                data_id=self.data_id, scaler_id=scaler_id)

            # hard-coding for rarely changed settings
            self.data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100,
                                  min_docs=min_docs,
                                  min_rele=min_rele,
                                  train_rough_batch_size=tr_batch_size,
                                  scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level,
                                  unknown_as_zero=unknown_as_zero,
                                  binary_rele=binary_rele)
        else:
            unknown_as_zero = False  # keep the original labels, e.g., for a semi-supervised dataset
            binary_rele = False  # keep the original labels
            train_presort, validation_presort, test_presort = True, True, True
            #train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100
            train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 100, 100, 100
            scale_data, scaler_id, scaler_level = get_scaler_setting(
                data_id=self.data_id)

            # more data settings that are rarely changed
            self.data_dict = dict(
                data_id=self.data_id,
                dir_data=self.dir_data,
                min_docs=10,
                min_rele=1,
                scale_data=scale_data,
                scaler_id=scaler_id,
                scaler_level=scaler_level,
                train_presort=train_presort,
                validation_presort=validation_presort,
                test_presort=test_presort,
                train_rough_batch_size=train_rough_batch_size,
                validation_rough_batch_size=validation_rough_batch_size,
                test_rough_batch_size=test_rough_batch_size,
                unknown_as_zero=unknown_as_zero,
                binary_rele=binary_rele)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information

        if self.debug: data_meta['fold_num'] = 2
        self.data_dict.update(data_meta)

        return self.data_dict
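
Across all seven examples, min_docs and min_rele read as query-level filters: queries with fewer than min_docs documents, or without at least min_rele relevant documents, are dropped at loading time. That reading follows the parameter names; the filtering itself lives in the loader, not in these snippets. A toy version:

queries = {'q1': [1, 0, 0], 'q2': [0, 0], 'q3': [2, 1, 0, 0]}
min_docs, min_rele = 3, 1
kept = {q: labels for q, labels in queries.items()
        if len(labels) >= min_docs and sum(l > 0 for l in labels) >= min_rele}
print(sorted(kept))  # ['q1', 'q3']; 'q2' has too few documents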