예제 #1
0
 def update_curt_select_param(self):
     """Swap the current selection state for a fresh one.

     The replacement ``SelectionProperties`` reuses the header of the
     current state, takes the current ``all_left_col_indexes`` as its
     "last left" indexes, and registers the current ``left_col_names`` as
     the columns to select.
     """
     previous = self.curt_select_properties
     refreshed = SelectionProperties()
     refreshed.set_header(previous.header)
     refreshed.set_last_left_col_indexes(previous.all_left_col_indexes)
     refreshed.add_select_col_names(previous.left_col_names)
     self.curt_select_properties = refreshed
예제 #2
0
 def sync_select_cols(self, suffix=tuple()):
     """Fetch the remote select-column names and wrap each set of names.

     Args:
         suffix: Suffix forwarded to ``_host_select_cols_transfer.get``.

     Returns:
         A list with one ``SelectionProperties`` per received list of
         names; in each of them the received names form the header, every
         index is marked as "last left", and every name is selected.
     """

     def _as_properties(col_names):
         # Build a selection state in which all received columns are kept.
         props = SelectionProperties()
         props.set_header(col_names)
         props.set_last_left_col_indexes(list(range(len(col_names))))
         props.add_select_col_names(col_names)
         return props

     received = self._host_select_cols_transfer.get(idx=-1, suffix=suffix)
     return [_as_properties(col_names) for col_names in received]
 def update_curt_select_param(self):
     """Swap the current selection state for a fresh one and log it.

     The replacement ``SelectionProperties`` reuses the header of the
     current state, takes the current ``all_left_col_indexes`` as its
     "last left" indexes, and registers the current ``left_col_names`` as
     the columns to select.
     """
     previous = self.curt_select_properties
     refreshed = SelectionProperties()
     refreshed.set_header(previous.header)
     refreshed.set_last_left_col_indexes(previous.all_left_col_indexes)
     refreshed.add_select_col_names(previous.left_col_names)

     # Dump the freshly built state before it becomes the current one.
     template = ("In update_curt_select_param, header: {}, cols_map: {},"
                 "last_left_col_indexes: {}, select_col_names: {}")
     LOGGER.debug(
         template.format(refreshed.header, refreshed.col_name_maps,
                         refreshed.last_left_col_indexes,
                         refreshed.select_col_names))
     self.curt_select_properties = refreshed
예제 #4
0
 def _run_filter(self, data_table):
     """Fit a percentage-value filter on ``data_table`` and check the result.

     The filter is configured with ``upper_pct = 0.2`` and starts from a
     state where every header column is selected; the columns at header
     positions 3 and 4 are expected to remain.
     """
     param = FeatureSelectionParam()
     param.percentage_value_param.upper_pct = 0.2
     percentage_filter = get_filter(consts.PERCENTAGE_VALUE, param)

     # Initial state: all header columns present and selected.
     initial_props = SelectionProperties()
     initial_props.set_header(self.header)
     initial_props.set_last_left_col_indexes(list(range(len(self.header))))
     initial_props.set_select_all_cols()
     percentage_filter.set_selection_properties(initial_props)

     outcome = percentage_filter.fit(data_table,
                                     suffix='').selection_properties
     expected_positions = [3, 4]
     self.assertEqual(outcome.all_left_col_names,
                      [self.header[pos] for pos in expected_positions])
예제 #5
0
 def test_unique_logic(self):
     """Check that the unique-value filter keeps only header column 1."""
     table = self.gen_data(1000, 48)
     param = FeatureSelectionParam()
     unique_filter = get_filter(consts.UNIQUE_VALUE, param)

     # Initial state: all header columns present and selected.
     initial_props = SelectionProperties()
     initial_props.set_header(self.header)
     initial_props.set_last_left_col_indexes(list(range(len(self.header))))
     initial_props.set_select_all_cols()
     unique_filter.set_selection_properties(initial_props)

     outcome = unique_filter.fit(table, suffix='').selection_properties
     self.assertEqual(outcome.all_left_col_names, [self.header[1]])
     table.destroy()
예제 #6
0
 def test_left_logic(self):
     """Check the manual filter when both left indexes and left names are set.

     Indexes ``[0, 1]`` and names ``['3', '2']`` are requested; the
     surviving column names are expected to be ``['0', '1', '2', '3']``.
     """
     # NOTE(review): the generated table is never destroyed in this test -
     # confirm whether cleanup is required.
     table = self.gen_data(1000, 10, 48)
     param = FeatureSelectionParam()
     param.manually_param.left_col_indexes = [0, 1]
     param.manually_param.left_col_names = ['3', '2']
     manual_filter = get_filter(consts.MANUALLY_FILTER, param)

     # Initial state: all header columns present and selected.
     initial_props = SelectionProperties()
     initial_props.set_header(self.header)
     initial_props.set_last_left_col_indexes(list(range(len(self.header))))
     initial_props.set_select_all_cols()
     manual_filter.set_selection_properties(initial_props)

     outcome = manual_filter.fit(table, suffix='').selection_properties
     expected = ['0', '1', '2', '3']
     self.assertEqual(outcome.all_left_col_names, expected)
예제 #7
0
    def test_unique_logic(self):
        """Check the coefficient-of-variation threshold filter.

        Columns whose entry in ``self.coe_list`` is at least the configured
        threshold (0.1) are expected to survive, and exactly nine of them
        are expected.
        """
        table = self.gen_data(1000, 10, 48)
        param = FeatureSelectionParam()
        param.variance_coe_param.value_threshold = 0.1
        coe_filter = get_filter(consts.COEFFICIENT_OF_VARIATION_VALUE_THRES,
                                param)

        # Initial state: all header columns present and selected.
        initial_props = SelectionProperties()
        initial_props.set_header(self.header)
        initial_props.set_last_left_col_indexes(
            list(range(len(self.header))))
        initial_props.set_select_all_cols()
        coe_filter.set_selection_properties(initial_props)

        outcome = coe_filter.fit(table, suffix='').selection_properties

        threshold = param.variance_coe_param.value_threshold
        expected = []
        for position, coefficient in enumerate(self.coe_list):
            if coefficient >= threshold:
                expected.append(self.header[position])

        self.assertEqual(outcome.all_left_col_names, expected)
        self.assertEqual(len(outcome.all_left_col_names), 9)
        table.destroy()
예제 #8
0
    def test_filter_logic(self):
        """Check the outlier filter keeps the first nine header columns.

        The filter is configured with ``percentile = 0.9`` and
        ``upper_threshold = 99``.
        """
        table = self.gen_data(1000, 10, 48)
        param = FeatureSelectionParam()
        param.outlier_param.percentile = 0.9
        param.outlier_param.upper_threshold = 99
        outlier_filter = get_filter(consts.OUTLIER_COLS, param)

        # Initial state: all header columns present and selected.
        initial_props = SelectionProperties()
        initial_props.set_header(self.header)
        initial_props.set_last_left_col_indexes(
            list(range(len(self.header))))
        initial_props.set_select_all_cols()
        outlier_filter.set_selection_properties(initial_props)

        outcome = outlier_filter.fit(table, suffix='').selection_properties

        expected = [self.header[position] for position in range(9)]
        self.assertEqual(outcome.all_left_col_names, expected)
        self.assertEqual(len(outcome.all_left_col_names), 9)
        table.destroy()
예제 #9
0
class BaseHeteroFeatureSelection(ModelBase):
    """Base class of the hetero feature-selection component.

    Two pieces of state are threaded through the chain of filters:

    * ``curt_select_properties`` - the working ``SelectionProperties``
      handed to the next filter; it is rebuilt after every filter by
      ``update_curt_select_param``.
    * ``completed_selection_result`` - accumulates the result of every
      filter that has run and is the source of the exported model and of
      the column indexes used when transforming data.
    """

    def __init__(self):
        super(BaseHeteroFeatureSelection, self).__init__()
        self.transfer_variable = HeteroFeatureSelectionTransferVariable()

        self.curt_select_properties = SelectionProperties()
        self.completed_selection_result = CompletedSelectionResults()

        self.schema = None
        self.header = None
        self.party_name = 'Base'
        # Possible previous model
        self.binning_model = None
        self.static_obj = None
        self.model_param = FeatureSelectionParam()
        # One meta object per executed filter, appended by ``_filter``.
        self.meta_list = []
        # Converted upstream models keyed by their ``model_name``.
        self.isometric_models = {}

    def _init_model(self, params):
        """Store the component parameters and the list of filter methods."""
        self.model_param = params
        self.filter_methods = params.filter_methods

    def _init_select_params(self, data_instances):
        """Initialise header, schema and selection state from the input data.

        When a header is already known (for example after ``load_model``)
        only a missing schema is filled in and the selection state is left
        untouched.
        """
        if self.schema is None:
            self.schema = data_instances.schema

        if self.header is not None:
            return
        self.schema = data_instances.schema
        header = get_header(data_instances)
        self.header = header
        self.curt_select_properties.set_header(header)
        self.curt_select_properties.set_last_left_col_indexes(
            list(range(len(header))))
        # -1 is the "select every column" marker of the parameter object.
        if self.model_param.select_col_indexes == -1:
            self.curt_select_properties.set_select_all_cols()
        else:
            self.curt_select_properties.add_select_col_indexes(
                self.model_param.select_col_indexes)
        self.curt_select_properties.add_select_col_names(
            self.model_param.select_names)
        self.completed_selection_result.set_header(header)
        self.completed_selection_result.set_select_col_names(
            self.curt_select_properties.select_col_names)
        self.completed_selection_result.set_all_left_col_indexes(
            self.curt_select_properties.all_left_col_indexes)

    def _get_meta(self):
        """Build the ``FeatureSelectionMeta`` protobuf of this component."""
        meta_dicts = {
            'filter_methods': self.filter_methods,
            'cols': self.completed_selection_result.get_select_col_names(),
            'need_run': self.need_run,
            "filter_metas": self.meta_list
        }
        meta_protobuf_obj = feature_selection_meta_pb2.FeatureSelectionMeta(
            **meta_dicts)
        return meta_protobuf_obj

    def _get_param(self):
        """Build the ``FeatureSelectionParam`` protobuf of this component.

        On the guest side one ``HostColNames`` entry is emitted per host
        returned by ``get_host_sorted_col_names``; on any other role a
        single entry holding the anonymous names of the local header is
        emitted.
        """
        LOGGER.debug(
            "curt_select_properties.left_col_name: {}, completed_selection_result: {}"
            .format(self.curt_select_properties.left_col_names,
                    self.completed_selection_result.all_left_col_names))
        LOGGER.debug("Length of left cols: {}".format(
            len(self.completed_selection_result.all_left_col_names)))
        left_cols = {
            x: True
            for x in self.completed_selection_result.all_left_col_names
        }
        final_left_cols = feature_selection_param_pb2.LeftCols(
            original_cols=self.completed_selection_result.get_select_col_names(
            ),
            left_cols=left_cols)

        host_col_names = []
        if self.role == consts.GUEST:
            for host_id, this_host_name in enumerate(
                    self.completed_selection_result.get_host_sorted_col_names(
                    )):
                party_id = self.component_properties.host_party_idlist[host_id]
                LOGGER.debug(
                    "In _get_param, this_host_name: {}, party_id: {}".format(
                        this_host_name, party_id))

                host_col_names.append(
                    feature_selection_param_pb2.HostColNames(
                        col_names=this_host_name, party_id=str(party_id)))
        else:
            party_id = self.component_properties.local_partyid
            anonymous_names = [
                anonymous_generator.generate_anonymous(fid, model=self)
                for fid in range(len(self.header))
            ]
            host_col_names.append(
                feature_selection_param_pb2.HostColNames(
                    col_names=anonymous_names, party_id=str(party_id)))

        result_obj = feature_selection_param_pb2.FeatureSelectionParam(
            results=self.completed_selection_result.filter_results,
            final_left_cols=final_left_cols,
            col_names=self.completed_selection_result.get_sorted_col_names(),
            host_col_names=host_col_names,
            header=self.curt_select_properties.header)

        return result_obj

    def save_data(self):
        """Return the data output of the component."""
        return self.data_output

    def export_model(self):
        """Return the meta/param protobuf pair, building it on first use."""
        LOGGER.debug("Model output is : {}".format(self.model_output))
        if self.model_output is not None:
            LOGGER.debug("model output is already exist, return directly")
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def _load_selection_model(self, model_dict):
        """Restore the selection state from a previously exported model.

        Does nothing when ``need_run`` is false. Otherwise the first entry
        under ``model_dict['model']`` is used: its header becomes the
        current header and its final left columns are recorded as a
        ``'conclusion'`` filter result.
        """
        LOGGER.debug("Feature selection need run: {}".format(self.need_run))
        if not self.need_run:
            return
        # Only the first stored model entry is considered.
        first_model = list(model_dict.get('model').values())[0]
        model_param = first_model.get(MODEL_PARAM_NAME)
        model_meta = first_model.get(MODEL_META_NAME)

        self.model_output = {
            MODEL_META_NAME: model_meta,
            MODEL_PARAM_NAME: model_param
        }

        header = list(model_param.header)
        self.header = header
        self.curt_select_properties.set_header(header)
        self.completed_selection_result.set_header(header)
        self.curt_select_properties.set_last_left_col_indexes(
            list(range(len(header))))
        self.curt_select_properties.add_select_col_names(header)

        final_left_cols_names = dict(model_param.final_left_cols.left_cols)
        LOGGER.debug("final_left_cols_names: {}".format(final_left_cols_names))
        for col_name in final_left_cols_names:
            self.curt_select_properties.add_left_col_name(col_name)
        self.completed_selection_result.add_filter_results(
            filter_name='conclusion',
            select_properties=self.curt_select_properties)
        self.update_curt_select_param()

    def _load_isometric_model(self, iso_model):
        """Convert upstream component models and index them by model name.

        Args:
            iso_model: Mapping of component name to a mapping of protobuf
                name to protobuf object. The entry whose name ends with
                ``"Param"`` is treated as the param object, any other entry
                as the meta object.

        Raises:
            ValueError: If a component provides no ``*Param`` entry, or if
                two components share the same ``model_name``.
        """
        LOGGER.debug(f"In _load_isometric_model, iso_model: {iso_model}")
        for cpn_name, model_dict in iso_model.items():
            model_param = None
            model_meta = None
            for name, model_pb in model_dict.items():
                if name.endswith("Param"):
                    model_param = model_pb
                else:
                    model_meta = model_pb
            if model_param is None:
                # Fail with a clear message instead of an AttributeError on
                # None when reading ``model_name`` below.
                raise ValueError(
                    f"Isometric model of component {cpn_name} has no Param"
                    " object")
            model_name = model_param.model_name
            if model_name in self.isometric_models:
                raise ValueError(
                    "Should not load two same type isometric models"
                    " in feature selection")
            adapter = adapter_factory(model_name)
            this_iso_model = adapter.convert(model_meta, model_param)
            self.isometric_models[model_name] = this_iso_model

    def load_model(self, model_dict):
        """Load the selection model and/or isometric models, if present."""
        LOGGER.debug(f"In load_model, model_dict: {model_dict}")
        if 'model' in model_dict:
            self._load_selection_model(model_dict)

        if 'isometric_model' in model_dict:
            self._load_isometric_model(model_dict['isometric_model'])

    @staticmethod
    def select_cols(instance, left_col_idx):
        """Keep only the features at ``left_col_idx``; mutates ``instance``."""
        instance.features = instance.features[left_col_idx]
        return instance

    def _transfer_data(self, data_instances):
        """Project every instance onto the finally left columns.

        Returns the mapped table with its schema header set to the left
        column names.
        """
        # Sample taken only for the debug log below.
        before_one_data = data_instances.first()
        f = functools.partial(
            self.select_cols,
            left_col_idx=self.completed_selection_result.all_left_col_indexes)

        new_data = data_instances.mapValues(f)

        LOGGER.debug("When transfering, all left_col_names: {}".format(
            self.completed_selection_result.all_left_col_names))
        new_data = self.set_schema(
            new_data, self.completed_selection_result.all_left_col_names)

        one_data = new_data.first()[1]
        LOGGER.debug(
            "In feature selection transform, Before transform: {}, length: {} After transform: {}, length: {}"
            .format(before_one_data[1].features,
                    len(before_one_data[1].features), one_data.features,
                    len(one_data.features)))

        return new_data

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)

    def set_schema(self, data_instance, header=None):
        """Write ``header`` into ``self.schema`` and attach it to the table.

        When ``header`` is None the header of the current selection state
        is used. ``self.schema`` itself is updated in place.
        """
        if header is None:
            self.schema["header"] = self.curt_select_properties.header
        else:
            self.schema["header"] = header
        data_instance.schema = self.schema
        return data_instance

    def update_curt_select_param(self):
        """Rebuild the working selection state for the next filter.

        The new state keeps the header, takes the current
        ``all_left_col_indexes`` as its "last left" indexes and selects
        the current ``left_col_names``.
        """
        new_select_properties = SelectionProperties()
        new_select_properties.set_header(self.curt_select_properties.header)
        new_select_properties.set_last_left_col_indexes(
            self.curt_select_properties.all_left_col_indexes)
        new_select_properties.add_select_col_names(
            self.curt_select_properties.left_col_names)
        self.curt_select_properties = new_select_properties

    def _filter(self, data_instances, method, suffix, idx=0):
        """Run one filter and record its outcome.

        Args:
            data_instances: Table the filter is fitted on.
            method: Filter name passed to ``filter_factory.get_filter``.
            suffix: Suffix forwarded to the filter's ``fit``.
            idx: Index of the metric to use for filters that are
                configured with a list of metrics.
        """
        this_filter = filter_factory.get_filter(filter_name=method,
                                                model_param=self.model_param,
                                                role=self.role,
                                                model=self,
                                                idx=idx)
        # From here on ``method`` is the name under which the result is
        # recorded: metric-based filters are reported per metric.
        if method == consts.STATISTIC_FILTER:
            method = self.model_param.statistic_param.metrics[idx]
        elif method == consts.IV_FILTER:
            metric = self.model_param.iv_param.metrics[idx]
            f_type = self.model_param.iv_param.filter_type[idx]
            method = f"{metric}_{f_type}"
        elif method == consts.PSI_FILTER:
            metric = self.model_param.psi_param.metrics[idx]
            f_type = self.model_param.psi_param.filter_type[idx]
            method = f"{metric}_{f_type}"
        this_filter.set_selection_properties(self.curt_select_properties)

        this_filter.set_transfer_variable(self.transfer_variable)
        self.curt_select_properties = this_filter.fit(
            data_instances, suffix).selection_properties
        host_select_properties = getattr(this_filter,
                                         'host_selection_properties', None)
        # Truthiness guard: an empty host list must not make the debug log
        # fail with an IndexError.
        if host_select_properties:
            LOGGER.debug("method: {}, host_select_properties: {}".format(
                method, host_select_properties[0].all_left_col_names))

        self.completed_selection_result.add_filter_results(
            filter_name=method,
            select_properties=self.curt_select_properties,
            host_select_properties=host_select_properties)
        last_col_nums = len(self.curt_select_properties.last_left_col_names)
        left_col_names = self.curt_select_properties.left_col_names
        self.add_summary(
            method, {
                "last_col_nums": last_col_nums,
                "left_col_nums": len(left_col_names),
                "left_col_names": left_col_names
            })
        LOGGER.debug("method: {}, selection_cols: {}, left_cols: {}".format(
            method, self.curt_select_properties.select_col_names,
            self.curt_select_properties.left_col_names))
        self.update_curt_select_param()
        LOGGER.debug("After updated, method: {}, selection_cols: {}".format(
            method, self.curt_select_properties.select_col_names))
        self.meta_list.append(this_filter.get_meta_obj())

    def fit(self, data_instances):
        """Run every configured filter and return the transformed table.

        Filters configured with a list of metrics are run once per metric,
        with the suffix ``(filter index, metric index)``; every other
        filter runs once with the filter index as suffix.
        """
        LOGGER.info("Start Hetero Selection Fit and transform.")
        self._abnormal_detection(data_instances)
        self._init_select_params(data_instances)

        original_col_nums = len(
            self.curt_select_properties.last_left_col_names)

        if len(self.curt_select_properties.select_col_indexes) == 0:
            LOGGER.warning("None of columns has been set to select")
        else:
            for filter_idx, method in enumerate(self.filter_methods):
                if method in [
                        consts.STATISTIC_FILTER, consts.IV_FILTER,
                        consts.PSI_FILTER, consts.HETERO_SBT_FILTER,
                        consts.HOMO_SBT_FILTER, consts.HETERO_FAST_SBT_FILTER
                ]:
                    if method == consts.STATISTIC_FILTER:
                        metrics = self.model_param.statistic_param.metrics
                    elif method == consts.IV_FILTER:
                        metrics = self.model_param.iv_param.metrics
                    elif method == consts.PSI_FILTER:
                        metrics = self.model_param.psi_param.metrics
                    elif method in [
                            consts.HETERO_SBT_FILTER, consts.HOMO_SBT_FILTER,
                            consts.HETERO_FAST_SBT_FILTER
                    ]:
                        metrics = self.model_param.sbt_param.metrics
                    else:
                        raise ValueError(f"method: {method} is not supported")
                    for idx, _ in enumerate(metrics):
                        self._filter(data_instances,
                                     method,
                                     suffix=(str(filter_idx), str(idx)),
                                     idx=idx)
                else:
                    self._filter(data_instances,
                                 method,
                                 suffix=str(filter_idx))

        # After the last update_curt_select_param these are the columns
        # that survived all filters.
        final_left_col_names = self.curt_select_properties.last_left_col_names

        self.add_summary(
            "all", {
                "last_col_nums": original_col_nums,
                "left_col_nums": len(final_left_col_names),
                "left_col_names": final_left_col_names
            })

        new_data = self._transfer_data(data_instances)
        LOGGER.debug(f"Final summary: {self.summary()}")
        LOGGER.info("Finish Hetero Selection Fit and transform.")
        return new_data

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        """Apply the already determined column selection to a table."""
        self._abnormal_detection(data_instances)
        self._init_select_params(data_instances)
        new_data = self._transfer_data(data_instances)
        return new_data
class BaseHeteroFeatureSelection(ModelBase):
    """Base class of the hetero feature-selection component.

    ``curt_select_properties`` is the working selection state handed to
    the next filter, ``completed_selection_result`` accumulates the result
    of every filter that has run, and ``meta_dicts`` collects the keyword
    arguments used to build the exported meta protobuf.
    """

    def __init__(self):
        super(BaseHeteroFeatureSelection, self).__init__()
        self.transfer_variable = HeteroFeatureSelectionTransferVariable()

        self.curt_select_properties = SelectionProperties()
        self.completed_selection_result = CompletedSelectionResults()

        self.schema = None
        self.header = None
        self.party_name = 'Base'
        # Possible previous model
        self.binning_model = None
        self.static_obj = None
        self.model_param = FeatureSelectionParam()
        # Keyword arguments for FeatureSelectionMeta; every filter gets the
        # chance to extend it in ``_filter``.
        self.meta_dicts = {}

    def _init_model(self, params):
        """Store the component parameters and the list of filter methods."""
        self.model_param = params
        # self.cols_index = params.select_cols
        self.filter_methods = params.filter_methods
        # self.local_only = params.local_only

    def _init_select_params(self, data_instances):
        """Initialise header, schema and selection state from the input data.

        When a header is already known (for example after ``load_model``)
        only a missing schema is filled in and the selection state is left
        untouched.
        """
        if self.schema is None:
            self.schema = data_instances.schema

        if self.header is not None:
            return
        self.schema = data_instances.schema
        header = get_header(data_instances)
        self.header = header
        self.curt_select_properties.set_header(header)
        self.curt_select_properties.set_last_left_col_indexes(
            [x for x in range(len(header))])
        # -1 is the "select every column" marker of the parameter object.
        if self.model_param.select_col_indexes == -1:
            self.curt_select_properties.set_select_all_cols()
        else:
            self.curt_select_properties.add_select_col_indexes(
                self.model_param.select_col_indexes)
        self.curt_select_properties.add_select_col_names(
            self.model_param.select_names)
        self.completed_selection_result.set_header(header)
        self.completed_selection_result.set_select_col_names(
            self.curt_select_properties.select_col_names)
        self.completed_selection_result.set_all_left_col_indexes(
            self.curt_select_properties.all_left_col_indexes)

    def _get_meta(self):
        """Build the ``FeatureSelectionMeta`` protobuf.

        The common fields are written into ``self.meta_dicts`` (which the
        filters may already have extended) before it is unpacked into the
        protobuf constructor.
        """
        self.meta_dicts['filter_methods'] = self.filter_methods
        self.meta_dicts[
            'cols'] = self.completed_selection_result.get_select_col_names()
        self.meta_dicts['need_run'] = self.need_run
        meta_protobuf_obj = feature_selection_meta_pb2.FeatureSelectionMeta(
            **self.meta_dicts)
        return meta_protobuf_obj

    def _get_param(self):
        """Build the ``FeatureSelectionParam`` protobuf of this component.

        One ``HostColNames`` entry is emitted per item returned by
        ``get_host_sorted_col_names``, paired by position with
        ``component_properties.host_party_idlist``.
        """
        LOGGER.debug(
            "curt_select_properties.left_col_name: {}, completed_selection_result: {}"
            .format(self.curt_select_properties.left_col_names,
                    self.completed_selection_result.all_left_col_names))
        LOGGER.debug("Length of left cols: {}".format(
            len(self.completed_selection_result.all_left_col_names)))
        # left_cols = {x: True for x in self.curt_select_properties.left_col_names}
        left_cols = {
            x: True
            for x in self.completed_selection_result.all_left_col_names
        }
        final_left_cols = feature_selection_param_pb2.LeftCols(
            original_cols=self.completed_selection_result.get_select_col_names(
            ),
            left_cols=left_cols)

        host_col_names = []
        for host_id, this_host_name in enumerate(
                self.completed_selection_result.get_host_sorted_col_names()):
            party_id = self.component_properties.host_party_idlist[host_id]
            LOGGER.debug(
                "In _get_param, this_host_name: {}, party_id: {}".format(
                    this_host_name, party_id))

            host_col_names.append(
                feature_selection_param_pb2.HostColNames(
                    col_names=this_host_name, party_id=str(party_id)))

        result_obj = feature_selection_param_pb2.FeatureSelectionParam(
            results=self.completed_selection_result.filter_results,
            final_left_cols=final_left_cols,
            col_names=self.completed_selection_result.get_sorted_col_names(),
            host_col_names=host_col_names,
            header=self.curt_select_properties.header)

        # NOTE(review): the JSON rendering is produced only for this debug
        # message and is built regardless of the active log level.
        json_result = json_format.MessageToJson(result_obj)
        LOGGER.debug("json_result: {}".format(json_result))
        return result_obj

    def save_data(self):
        """Return the data output of the component."""
        return self.data_output

    def export_model(self):
        """Return the meta/param protobuf pair, building it on first use."""
        LOGGER.debug("Model output is : {}".format(self.model_output))
        if self.model_output is not None:
            LOGGER.debug("model output is already exist, return directly")
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def load_model(self, model_dict):
        """Restore the selection state and/or the upstream binning model.

        ``model_dict['model']``, when present, restores header and final
        left columns from the first stored entry.
        ``model_dict['isometric_model']``, when present, is loaded into a
        guest or host binning model depending on ``party_name``.
        """

        if 'model' in model_dict:
            # self._parse_need_run(model_dict, MODEL_META_NAME)
            LOGGER.debug("Feature selection need run: {}".format(
                self.need_run))
            # NOTE(review): this early return also skips the
            # 'isometric_model' branch below - confirm that is intended.
            if not self.need_run:
                return
            # Only the first stored model entry is considered.
            model_param = list(
                model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
            model_meta = list(
                model_dict.get('model').values())[0].get(MODEL_META_NAME)

            self.model_output = {
                MODEL_META_NAME: model_meta,
                MODEL_PARAM_NAME: model_param
            }

            header = list(model_param.header)
            # self.schema = {'header': header}
            self.header = header
            self.curt_select_properties.set_header(header)
            self.completed_selection_result.set_header(header)
            self.curt_select_properties.set_last_left_col_indexes(
                [x for x in range(len(header))])
            self.curt_select_properties.add_select_col_names(header)

            final_left_cols_names = dict(model_param.final_left_cols.left_cols)
            LOGGER.debug(
                "final_left_cols_names: {}".format(final_left_cols_names))
            for col_name, _ in final_left_cols_names.items():
                self.curt_select_properties.add_left_col_name(col_name)
            # Record the loaded outcome as a single 'conclusion' filter.
            self.completed_selection_result.add_filter_results(
                filter_name='conclusion',
                select_properties=self.curt_select_properties)
            self.update_curt_select_param()
            LOGGER.debug(
                "After load model, completed_selection_result.all_left_col_indexes: {}"
                .format(self.completed_selection_result.all_left_col_indexes))

        if 'isometric_model' in model_dict:

            LOGGER.debug(
                "Has isometric_model, model_dict: {}".format(model_dict))
            if self.party_name == consts.GUEST:
                self.binning_model = HeteroFeatureBinningGuest()
            else:
                self.binning_model = HeteroFeatureBinningHost()

            # The binning model expects its payload under the 'model' key.
            new_model_dict = {'model': model_dict['isometric_model']}
            self.binning_model.load_model(new_model_dict)

    @staticmethod
    def select_cols(instance, left_col_idx):
        """Keep only the features at ``left_col_idx``; mutates ``instance``."""
        instance.features = instance.features[left_col_idx]
        return instance

    def _transfer_data(self, data_instances):
        """Project every instance onto the finally left columns.

        Returns the mapped table with its schema header set to the left
        column names.
        """

        # Sample taken only for the debug log below.
        before_one_data = data_instances.first()
        f = functools.partial(
            self.select_cols,
            left_col_idx=self.completed_selection_result.all_left_col_indexes)

        new_data = data_instances.mapValues(f)

        LOGGER.debug("When transfering, all left_col_names: {}".format(
            self.completed_selection_result.all_left_col_names))
        new_data = self.set_schema(
            new_data, self.completed_selection_result.all_left_col_names)

        one_data = new_data.first()[1]
        LOGGER.debug(
            "In feature selection transform, Before transform: {}, length: {} After transform: {}, length: {}"
            .format(before_one_data[1].features,
                    len(before_one_data[1].features), one_data.features,
                    len(one_data.features)))

        return new_data

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

    def set_schema(self, data_instance, header=None):
        """Write ``header`` into ``self.schema`` and attach it to the table.

        When ``header`` is None the header of the current selection state
        is used. ``self.schema`` itself is updated in place.
        """
        if header is None:
            self.schema["header"] = self.curt_select_properties.header
        else:
            self.schema["header"] = header
        data_instance.schema = self.schema
        return data_instance

    def update_curt_select_param(self):
        """Rebuild the working selection state for the next filter.

        The new state keeps the header, takes the current
        ``all_left_col_indexes`` as its "last left" indexes and selects
        the current ``left_col_names``; it is logged before it replaces
        the current state.
        """
        new_select_properties = SelectionProperties()
        new_select_properties.set_header(self.curt_select_properties.header)
        new_select_properties.set_last_left_col_indexes(
            self.curt_select_properties.all_left_col_indexes)
        new_select_properties.add_select_col_names(
            self.curt_select_properties.left_col_names)
        LOGGER.debug("In update_curt_select_param, header: {}, cols_map: {},"
                     "last_left_col_indexes: {}, select_col_names: {}".format(
                         new_select_properties.header,
                         new_select_properties.col_name_maps,
                         new_select_properties.last_left_col_indexes,
                         new_select_properties.select_col_names))
        self.curt_select_properties = new_select_properties

    def _filter(self, data_instances, method, suffix):
        """Run one filter and record its outcome.

        The filter receives the current selection state, the statistics
        object, the binning model and the transfer variable before it is
        fitted. Its result is added to ``completed_selection_result`` and
        the working state is rebuilt for the next filter.
        """
        this_filter = filter_factory.get_filter(filter_name=method,
                                                model_param=self.model_param,
                                                role=self.role)
        this_filter.set_selection_properties(self.curt_select_properties)
        this_filter.set_statics_obj(self.static_obj)
        this_filter.set_binning_obj(self.binning_model)
        this_filter.set_transfer_variable(self.transfer_variable)
        self.curt_select_properties = this_filter.fit(
            data_instances, suffix).selection_properties
        # Not every filter exposes host results; None is passed on then.
        host_select_properties = getattr(this_filter,
                                         'host_selection_properties', None)
        LOGGER.debug("method: {}, host_select_properties: {}".format(
            method, host_select_properties))

        self.completed_selection_result.add_filter_results(
            filter_name=method,
            select_properties=self.curt_select_properties,
            host_select_properties=host_select_properties)
        LOGGER.debug("method: {}, selection_cols: {}, left_cols: {}".format(
            method, self.curt_select_properties.select_col_names,
            self.curt_select_properties.left_col_names))
        self.update_curt_select_param()
        LOGGER.debug(
            "After updated, method: {}, selection_cols: {}, left_cols: {}".
            format(method, self.curt_select_properties.select_col_names,
                   self.curt_select_properties.left_col_names))
        # The filter returns the (possibly extended) meta dictionary.
        self.meta_dicts = this_filter.get_meta_obj(self.meta_dicts)

    def fit(self, data_instances):
        """Run every configured filter in order and return the transformed table.

        Each filter is fitted with its position in ``filter_methods`` as
        suffix. When no column is selected, no filter runs and a warning
        is logged.
        """
        LOGGER.info("Start Hetero Selection Fit and transform.")
        self._abnormal_detection(data_instances)
        self._init_select_params(data_instances)

        if len(self.curt_select_properties.select_col_indexes) == 0:
            LOGGER.warning("None of columns has been set to select")
        else:
            for filter_idx, method in enumerate(self.filter_methods):
                self._filter(data_instances, method, suffix=str(filter_idx))

        new_data = self._transfer_data(data_instances)
        LOGGER.info("Finish Hetero Selection Fit and transform.")
        return new_data

    @assert_io_num_rows_equal
    def transform(self, data_instances):
        """Apply the already determined column selection to a table."""
        self._abnormal_detection(data_instances)
        self._init_select_params(data_instances)
        new_data = self._transfer_data(data_instances)
        return new_data