Example #1
def _default_setting(self, header):
    if self.bin_inner_param is not None:
        return
    bin_inner_param = BinInnerParam()
    bin_inner_param.set_header(header)
    bin_inner_param.set_bin_all()
    bin_inner_param.set_transform_all()
    self.set_bin_inner_param(bin_inner_param)
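# A minimal usage sketch (illustrative, assuming a BaseBinning-style class as
# defined in the later examples): with no column selection configured, this
# default binds every column for both binning and transformation.
#
#     binning_obj = BaseBinning()
#     binning_obj._default_setting(['x1', 'x2', 'x3'])
#     # binning_obj.bin_inner_param now covers all three columns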
Example #2
class BaseBinning(object):
    """
    This is used to discretize data so that the transformed data, or the
    binning information itself, can be used for feature selection.
    """
    def __init__(self, params=None, abnormal_list=None, labels=None):
        self.bin_inner_param: BinInnerParam = None
        self.is_multi_class = False
        self.bin_results = SplitPointsResult()
        if params is None:
            return
        self.params = params
        self.bin_num = params.bin_num
        if abnormal_list is None:
            self.abnormal_list = []
        else:
            self.abnormal_list = abnormal_list
        self.split_points = None

    @property
    def header(self):
        return self.bin_inner_param.header

    # @property
    # def split_points(self):
    #     return self.bin_results.all_split_points

    def _default_setting(self, header):
        if self.bin_inner_param is not None:
            return
        self.bin_inner_param = BinInnerParam()

        self.bin_inner_param.set_header(header)
        if self.params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(self.params.bin_indexes)
            self.bin_inner_param.add_bin_names(self.params.bin_names)

        self.bin_inner_param.add_category_indexes(self.params.category_indexes)
        self.bin_inner_param.add_category_names(self.params.category_names)

        if self.params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                self.params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                self.params.transform_param.transform_names)

    def fit_split_points(self, data_instances):
        """
        Get split points

        Parameters
        ----------
        data_instances : Table
            The input data

        Returns
        -------

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        """
        pass
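    # Concrete subclasses (e.g. QuantileBinning, BucketBinning) are expected
    # to compute the split points, record them via
    # self.bin_results.put_col_split_points, and return the dict described
    # above. A hypothetical minimal override might look like:
    #
    #     def fit_split_points(self, data_instances):
    #         for col_name in self.bin_inner_param.bin_names:
    #             # fixed split points, for illustration only
    #             self.bin_results.put_col_split_points(col_name, [0.25, 0.5, 0.75])
    #         return self.bin_results.all_split_points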

    def fit_category_features(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)

        if len(self.bin_inner_param.category_indexes) > 0:
            statics_obj = data_overview.DataStatistics()
            category_col_values = statics_obj.static_all_values(
                data_instances, self.bin_inner_param.category_indexes,
                is_sparse)
            for col_name, split_points in zip(
                    self.bin_inner_param.category_names, category_col_values):
                self.bin_results.put_col_split_points(col_name, split_points)

    def set_bin_inner_param(self, bin_inner_param):
        self.bin_inner_param = bin_inner_param

    def transform(self, data_instances, transform_type):
        # self._init_cols(data_instances)
        for col_name in self.bin_inner_param.transform_bin_names:
            if col_name not in self.header:
                raise ValueError(
                    "Transform col_name: {} does not exist".format(col_name))

        if transform_type == 'bin_num':
            data_instances, _, _ = self.convert_feature_to_bin(data_instances)
        elif transform_type == 'woe':
            data_instances = self.convert_feature_to_woe(data_instances)

        return data_instances
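    # A minimal usage sketch (illustrative): with a fitted binning object,
    #     binned = binning_obj.transform(data_instances, transform_type='bin_num')
    # replaces each transformable feature with its bin index, while
    # transform_type='woe' replaces it with the bin's WOE value instead.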

    @staticmethod
    def get_data_bin(data_instances, split_points, bin_cols_map):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : Table
            The input data

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        Returns
        -------
        data_bin_table : Table

            Each element records the bin number each feature value falls into.
            e.g. it could be:
            [{'x1': 1, 'x2': 5, 'x3': 2}
            ...
             ]
        """
        # self._init_cols(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)
        header = data_instances.schema.get('header')

        f = functools.partial(BaseBinning.bin_data,
                              split_points=split_points,
                              cols_dict=bin_cols_map,
                              header=header,
                              is_sparse=is_sparse)
        data_bin_dict = data_instances.mapValues(f)
        return data_bin_dict

    def convert_feature_to_woe(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        return new_data

    def convert_feature_to_bin(self, data_instances, split_points=None):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if split_points is None:
            split_points = self.bin_results.all_split_points
        else:
            for col_name, sp in split_points.items():
                self.bin_results.put_col_split_points(col_name, sp)

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        header = get_header(data_instances)
        bin_sparse = self.get_sparse_bin(
            self.bin_inner_param.transform_bin_indexes, split_points, header)
        split_points_result = self.bin_results.get_split_points_array(
            self.bin_inner_param.transform_bin_names)

        return new_data, split_points_result, bin_sparse

    def _setup_bin_inner_param(self, data_instances, params):
        if self.bin_inner_param is not None:
            return
        self.bin_inner_param = BinInnerParam()

        header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.set_bin_inner_param(self.bin_inner_param)

    @staticmethod
    def _convert_sparse_data(instances,
                             bin_inner_param: BinInnerParam,
                             bin_results: SplitPointsResult,
                             abnormal_list: list,
                             convert_type: str = 'bin_num'):
        instances = copy.deepcopy(instances)
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()
        indice = []
        sparse_value = []
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_results.all_split_points

        for col_idx, col_value in all_data:
            if col_idx in transform_cols_idx:
                if col_value in abnormal_list:
                    indice.append(col_idx)
                    sparse_value.append(col_value)
                    continue
                # Missing values may have been stored among the sparse values;
                # any value not in abnormal_list is binned normally below.
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                indice.append(col_idx)
                if convert_type == 'bin_num':
                    sparse_value.append(bin_num)
                else:
                    sparse_value.append(col_value)
            else:
                indice.append(col_idx)
                sparse_value.append(col_value)

        sparse_vector = SparseVector(indice, sparse_value, data_shape)
        instances.features = sparse_vector
        return instances

    @staticmethod
    def get_sparse_bin(transform_cols_idx, split_points_dict, header):
        """
        Get which bin the value 0 falls into for each column.

        Returns
        -------
        Dict of sparse bin nums, keyed by column index
            {0: 2, 1: 3, 2: 5 ... }
        """
        result = {}
        for col_idx in transform_cols_idx:
            col_name = header[col_idx]
            split_points = split_points_dict[col_name]
            sparse_bin_num = BaseBinning.get_bin_num(0, split_points)
            result[col_idx] = sparse_bin_num
        return result
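    # A minimal worked example (illustrative): with header ['x1', 'x2'],
    # transform_cols_idx [0, 1] and split points
    # {'x1': [1, 2, 5], 'x2': [-1, 0, 3]}, the implicit sparse value 0 falls
    # into bin 0 for x1 and bin 1 for x2, so the result is {0: 0, 1: 1}.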

    @staticmethod
    def _convert_dense_data(instances,
                            bin_inner_param: BinInnerParam,
                            bin_results: SplitPointsResult,
                            abnormal_list: list,
                            convert_type: str = 'bin_num'):
        instances = copy.deepcopy(instances)
        features = instances.features
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_results.all_split_points

        transform_cols_idx_set = set(transform_cols_idx)

        for col_idx, col_value in enumerate(features):
            if col_idx in transform_cols_idx_set:
                if col_value in abnormal_list:
                    features[col_idx] = col_value
                    continue
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                if convert_type == 'bin_num':
                    features[col_idx] = bin_num
                else:
                    features[col_idx] = col_value

        instances.features = features
        return instances

    @staticmethod
    def convert_bin_counts_table(result_counts, idx):
        """
        Given event count information, calculate IV information

        Parameters
        ----------
        result_counts : Table
            Each entry is like:
                ('x1', [[label_0_count, label_1_count, ...], [label_0_count, label_1_count, ...] ... ])

        idx : int
            Index of the label treated as the event

        Returns
        -------
        A Table whose entries are like:
            ('x1', [[event_count, non_event_count], [event_count, non_event_count] ... ])
        """
        def _convert(list_counts):
            res = []
            for c_array in list_counts:
                event_count = c_array[idx]
                non_event_count = np.sum(c_array) - event_count
                res.append([event_count, non_event_count])
            return res

        return result_counts.mapValues(_convert)
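    # A minimal worked example of _convert (illustrative): with idx = 1, the
    # per-bin label counts [label_0, label_1, label_2] collapse into
    # [event_count, non_event_count] pairs where label 1 is the event:
    #
    #     _convert([[5, 3, 2], [1, 4, 5]])  # -> [[3, 7], [4, 6]]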

    @staticmethod
    def fill_sparse_result(col_name, static_nums, sparse_bin_points,
                           label_counts):
        """
        Parameters
        ----------
        static_nums : list
            It is like:
                [[event_count, total_num], [event_count, total_num] ... ]


        sparse_bin_points : dict
            Dict of sparse bin num
                {"x1": 2, "x2": 3, "x3": 5 ... }

        label_counts: np.array
            eg. [100, 200, ...]

        Returns
        -------
        The format is the same as result_counts.
        """

        curt_all = functools.reduce(lambda x, y: x + y, static_nums)
        sparse_bin = sparse_bin_points.get(col_name)
        static_nums[sparse_bin] = label_counts - curt_all
        return col_name, static_nums

    @staticmethod
    def bin_data(instance, split_points, cols_dict, header, is_sparse):
        """
        Apply the binning method

        Parameters
        ----------
        instance : Instance
            One row of the input data

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        cols_dict : dict
            Record key, value pairs where key is the col's name, and value is the col's index.

        header: list
            header of Table

        is_sparse: bool
            Specify whether it is sparse data or not

        Returns
        -------
        result_bin_dict : dict
            Maps each column name to the bin number its value belongs to.
            e.g. it could be:
            {'x1': 1, 'x2': 5, 'x3': 2}
        """

        result_bin_nums = {}

        if is_sparse:
            sparse_data = instance.features.get_all_data()
            for col_idx, col_value in sparse_data:
                col_name = header[col_idx]

                if col_name in cols_dict:
                    col_split_points = split_points[col_name]
                    col_bin_num = BaseBinning.get_bin_num(
                        col_value, col_split_points)
                    result_bin_nums[col_name] = col_bin_num
            return result_bin_nums

        # For dense data
        for col_name, col_index in cols_dict.items():
            col_split_points = split_points[col_name]

            value = instance.features[col_index]
            col_bin_num = BaseBinning.get_bin_num(value, col_split_points)
            result_bin_nums[col_name] = col_bin_num

        return result_bin_nums

    @staticmethod
    def get_bin_num(value, split_points):
        if np.isnan(value):
            return len(split_points)
        sp = split_points[:-1]
        col_bin_num = bisect.bisect_left(sp, value)
        # col_bin_num = bisect.bisect_left(split_points, value)
        return col_bin_num
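    # A minimal worked example (illustrative): the last split point acts as
    # the upper boundary, so bisect runs over split_points[:-1] and NaN falls
    # into an extra overflow bin.
    #
    #     BaseBinning.get_bin_num(0.05, [0.1, 0.2, 0.3])          # -> 0
    #     BaseBinning.get_bin_num(0.15, [0.1, 0.2, 0.3])          # -> 1
    #     BaseBinning.get_bin_num(0.25, [0.1, 0.2, 0.3])          # -> 2
    #     BaseBinning.get_bin_num(float('nan'), [0.1, 0.2, 0.3])  # -> 3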

    @staticmethod
    def add_label_in_partition_bak(data_bin_with_table, sparse_bin_points):
        """
        Aggregate the labels per bin, so that WOE and IV become convenient to
        calculate.

        Parameters
        ----------
        data_bin_with_table : Table
            The input data; each entry of the Table is like:
            (id, ({'x1': 1, 'x2': 5, 'x3': 2}, y))

        sparse_bin_points : dict
            Dict of sparse bin nums, keyed by column name
                {'x1': 2, 'x2': 3, 'x3': 5 ... }

        Returns
        -------
        result_sum : list of (col_name, counts) pairs, like:
            [('x1', [[event_count, total_num], [event_count, total_num] ... ]),
             ('x2', [[event_count, total_num], [event_count, total_num] ... ]),
             ...
            ]

        """
        result_sum = {}
        for _, datas in data_bin_with_table:
            bin_idx_dict = datas[0]
            y = datas[1]

            # y = y_combo[0]
            # inverse_y = y_combo[1]
            for col_name, bin_idx in bin_idx_dict.items():
                result_sum.setdefault(col_name, [])
                col_sum = result_sum[col_name]
                while bin_idx >= len(col_sum):
                    col_sum.append([0, 0])
                if bin_idx == sparse_bin_points[col_name]:
                    continue
                label_sum = col_sum[bin_idx]
                label_sum[0] = label_sum[0] + y
                label_sum[1] = label_sum[1] + 1
                col_sum[bin_idx] = label_sum
                result_sum[col_name] = col_sum

        return list(result_sum.items())
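    # A minimal worked example (illustrative, binary labels): given the rows
    #     (id1, ({'x1': 0}, 1)) and (id2, ({'x1': 1}, 0))
    # with sparse_bin_points {'x1': 2}, bin 0 of x1 accumulates [1, 1]
    # (one event out of one row) and bin 1 accumulates [0, 1], yielding
    #     [('x1', [[1, 1], [0, 1]])]
    # Counts for the sparse bin itself are skipped here and back-filled
    # later by fill_sparse_result.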
Example #3
class BaseBinning(object):
    """
    This is used to discretize data so that the transformed data, or the
    binning information itself, can be used for feature selection.
    """
    def __init__(self, params=None, abnormal_list=None):
        self.bin_inner_param: BinInnerParam = None
        self.bin_results = BinResults()
        if params is None:
            return
        self.params = params
        self.bin_num = params.bin_num
        self.event_total = None
        self.non_event_total = None

        if abnormal_list is None:
            self.abnormal_list = []
        else:
            self.abnormal_list = abnormal_list

    def set_role_party(self, role, party_id):
        self.bin_results.set_role_party(role, party_id)

    @property
    def header(self):
        return self.bin_inner_param.header

    @property
    def split_points(self):
        return self.bin_results.all_split_points

    def _default_setting(self, header):
        if self.bin_inner_param is not None:
            return
        bin_inner_param = BinInnerParam()
        bin_inner_param.set_header(header)
        bin_inner_param.set_bin_all()
        bin_inner_param.set_transform_all()
        self.set_bin_inner_param(bin_inner_param)

    def fit_split_points(self, data_instances):
        """
        Get split points

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        """
        pass

    def fit_category_features(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)

        if len(self.bin_inner_param.category_indexes) > 0:
            statics_obj = data_overview.DataStatistics()
            category_col_values = statics_obj.static_all_values(
                data_instances, self.bin_inner_param.category_indexes,
                is_sparse)
            for col_name, split_points in zip(
                    self.bin_inner_param.category_names, category_col_values):
                self.bin_results.put_col_split_points(col_name, split_points)

    def set_bin_inner_param(self, bin_inner_param):
        self.bin_inner_param = bin_inner_param

    def transform(self, data_instances, transform_type):
        # self._init_cols(data_instances)
        for col_name in self.bin_inner_param.transform_bin_names:
            if col_name not in self.header:
                raise ValueError(
                    "Transform col_name: {} does not exist".format(col_name))

        if transform_type == 'bin_num':
            data_instances, _, _ = self.convert_feature_to_bin(data_instances)
        elif transform_type == 'woe':
            data_instances = self.convert_feature_to_woe(data_instances)

        return data_instances

    def get_data_bin(self, data_instances, split_points=None):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        Returns
        -------
        data_bin_table : DTable

            Each element records the bin number each feature value falls into.
            e.g. it could be:
            [{'x1': 1, 'x2': 5, 'x3': 2}
            ...
             ]
        """
        # self._init_cols(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        if split_points is None:
            split_points = self.fit_split_points(data_instances)

        f = functools.partial(self.bin_data,
                              split_points=split_points,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              header=self.header,
                              is_sparse=is_sparse)
        data_bin_dict = data_instances.mapValues(f)
        return data_bin_dict

    def convert_feature_to_woe(self, data_instances):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='woe')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        return new_data

    def convert_feature_to_bin(self, data_instances, split_points=None):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if split_points is None:
            split_points = self.bin_results.all_split_points
        else:
            for col_name, sp in split_points.items():
                self.bin_results.put_col_split_points(col_name, sp)

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        bin_sparse = self.get_sparse_bin(
            self.bin_inner_param.transform_bin_indexes, split_points)
        split_points_result = self.bin_results.get_split_points_array(
            self.bin_inner_param.transform_bin_names)

        return new_data, split_points_result, bin_sparse

    def _setup_bin_inner_param(self, data_instances, params):
        if self.bin_inner_param is not None:
            return
        self.bin_inner_param = BinInnerParam()

        header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.set_bin_inner_param(self.bin_inner_param)

    @staticmethod
    def _convert_sparse_data(instances,
                             bin_inner_param: BinInnerParam,
                             bin_results: BinResults,
                             abnormal_list: list,
                             convert_type: str = 'bin_num'):
        instances = copy.deepcopy(instances)
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()
        indice = []
        sparse_value = []
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_results.all_split_points

        for col_idx, col_value in all_data:
            if col_idx in transform_cols_idx:
                if col_value in abnormal_list:
                    indice.append(col_idx)
                    sparse_value.append(col_value)
                    continue
                # Missing values may have been stored among the sparse values;
                # any value not in abnormal_list is binned normally below.
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                indice.append(col_idx)
                if convert_type == 'bin_num':
                    sparse_value.append(bin_num)
                elif convert_type == 'woe':
                    col_results = bin_results.all_cols_results.get(col_name)
                    woe_value = col_results.woe_array[bin_num]
                    sparse_value.append(woe_value)
                else:
                    sparse_value.append(col_value)
            else:
                indice.append(col_idx)
                sparse_value.append(col_value)

        sparse_vector = SparseVector(indice, sparse_value, data_shape)
        instances.features = sparse_vector
        return instances

    def get_sparse_bin(self, transform_cols_idx, split_points_dict):
        """
        Get which bin the value 0 falls into for each column.

        Returns
        -------
        Dict of sparse bin nums, keyed by column index
            {0: 2, 1: 3, 2: 5 ... }
        """
        result = {}
        for col_idx in transform_cols_idx:
            col_name = self.header[col_idx]
            split_points = split_points_dict[col_name]
            sparse_bin_num = self.get_bin_num(0, split_points)
            result[col_idx] = sparse_bin_num
        return result

    @staticmethod
    def _convert_dense_data(instances,
                            bin_inner_param: BinInnerParam,
                            bin_results: BinResults,
                            abnormal_list: list,
                            convert_type: str = 'bin_num'):
        instances = copy.deepcopy(instances)
        features = instances.features
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_results.all_split_points

        transform_cols_idx_set = set(transform_cols_idx)

        for col_idx, col_value in enumerate(features):
            if col_idx in transform_cols_idx_set:
                if col_value in abnormal_list:
                    features[col_idx] = col_value
                    continue
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                if convert_type == 'bin_num':
                    features[col_idx] = bin_num
                elif convert_type == 'woe':
                    col_results = bin_results.all_cols_results.get(col_name)
                    woe_value = col_results.woe_array[bin_num]
                    features[col_idx] = woe_value
                else:
                    features[col_idx] = col_value

        instances.features = features
        return instances

    def cal_local_iv(self,
                     data_instances,
                     label_counts,
                     split_points=None,
                     label_table=None):
        """
        Calculate iv attributes

        Parameters
        ----------
        data_instances : DTable
            The input data

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        label_table : DTable
            id with labels

        Returns
        -------
        None. The calculated IV/WOE results are stored into self.bin_results.

        """
        # self._init_cols(data_instances)

        if split_points is None and self.split_points is None:
            split_points = self.fit_split_points(data_instances)
        elif split_points is None:
            split_points = self.split_points

        data_bin_table = self.get_data_bin(data_instances, split_points)
        sparse_bin_points = self.get_sparse_bin(
            self.bin_inner_param.bin_indexes, self.split_points)
        sparse_bin_points = {
            self.bin_inner_param.header[k]: v
            for k, v in sparse_bin_points.items()
        }
        if label_table is None:
            label_table = data_instances.mapValues(lambda x: x.label)
        # event_count_table = label_table.mapValues(lambda x: (x, 1 - x))
        data_bin_with_label = data_bin_table.join(label_table, lambda x, y:
                                                  (x, y))
        f = functools.partial(self.add_label_in_partition,
                              sparse_bin_points=sparse_bin_points)

        result_counts = data_bin_with_label.mapReducePartitions(
            f, self.aggregate_partition_label)

        def cal_zeros(bin_results):
            # Turn per-bin [event_count, total_num] into [event_count, non_event_count]
            for b in bin_results:
                b[1] = b[1] - b[0]
            return bin_results

        result_counts = result_counts.mapValues(cal_zeros)

        f = functools.partial(self.fill_sparse_result,
                              sparse_bin_points=sparse_bin_points,
                              label_counts=label_counts)
        result_counts = result_counts.map(f)
        self.cal_iv_woe(result_counts, self.params.adjustment_factor)

    @staticmethod
    def fill_sparse_result(col_name, static_nums, sparse_bin_points,
                           label_counts):
        """
        Parameters
        ----------
        static_nums : list
            It is like:
                [[event_count, total_num], [event_count, total_num] ... ]


        sparse_bin_points : dict
            Dict of sparse bin num
                {"x1": 2, "x2": 3, "x3": 5 ... }

        label_counts: dict
            eg. {0: 100, 1: 200}

        Returns
        -------
        The format is the same as result_counts.
        """

        curt_all = functools.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]),
                                    static_nums)
        # LOGGER.debug(f"In fill_sparse_result, curt_all: {curt_all}, label_count: {label_counts}")
        sparse_bin = sparse_bin_points.get(col_name)
        static_nums[sparse_bin] = [
            label_counts[1] - curt_all[0], label_counts[0] - curt_all[1]
        ]
        return col_name, static_nums
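    # A minimal worked example (illustrative, binary case): with
    # label_counts {0: 100, 1: 200} and the explicit bins summing to
    # curt_all = (150, 70) (events, non-events), the sparse bin is
    # back-filled with [200 - 150, 100 - 70] = [50, 30], i.e. the rows that
    # never appeared in the explicit counts because their value was the
    # implicit sparse 0.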

    @staticmethod
    def bin_data(instance, split_points, cols_dict, header, is_sparse):
        """
        Apply the binning method

        Parameters
        ----------
        instance : Instance
            One row of the input data

        split_points : dict
            Each value holds the split points for one feature; each element
            is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...}                               # Other features

        cols_dict : dict
            Record key, value pairs where key is the col's name, and value is the col's index.

        header : list
            Header of the data table

        is_sparse : bool
            Specify whether it is sparse data or not

        Returns
        -------
        result_bin_dict : dict
            Maps each column name to the bin number its value belongs to.
            e.g. it could be:
            {'x1': 1, 'x2': 5, 'x3': 2}
        """

        result_bin_nums = {}

        if is_sparse:
            sparse_data = instance.features.get_all_data()
            for col_idx, col_value in sparse_data:
                col_name = header[col_idx]

                if col_name in cols_dict:
                    col_split_points = split_points[col_name]
                    col_bin_num = BaseBinning.get_bin_num(
                        col_value, col_split_points)
                    result_bin_nums[col_name] = col_bin_num
            return result_bin_nums

        # For dense data
        for col_name, col_index in cols_dict.items():
            col_split_points = split_points[col_name]

            value = instance.features[col_index]
            col_bin_num = BaseBinning.get_bin_num(value, col_split_points)
            result_bin_nums[col_name] = col_bin_num

        return result_bin_nums

    @staticmethod
    def get_bin_num(value, split_points):
        sp = split_points[:-1]
        col_bin_num = bisect.bisect_left(sp, value)
        # col_bin_num = bisect.bisect_left(split_points, value)
        return col_bin_num

    @staticmethod
    def woe_1d(data_event_count, adjustment_factor):
        """
        Given event and non-event count in one column, calculate its woe value.

        Parameters
        ----------
        data_event_count : list
            One (event_sum, non_event_sum) pair per bin:
            [(event_sum, non_event_sum), (event_sum, non_event_sum), ...]

        adjustment_factor : float
            The adjustment factor when calculating WOE

        Returns
        -------
        BinColResults : object
            Stores the related IV and WOE values
        """
        event_total = 0
        non_event_total = 0
        for event_sum, non_event_sum in data_event_count:
            event_total += event_sum
            non_event_total += non_event_sum

        if event_total == 0:
            # raise ValueError("NO event label in target data")
            event_total = 1
        if non_event_total == 0:
            # raise ValueError("NO non-event label in target data")
            non_event_total = 1

        iv = 0
        event_count_array = []
        non_event_count_array = []
        event_rate_array = []
        non_event_rate_array = []
        woe_array = []
        iv_array = []

        for event_count, non_event_count in data_event_count:

            if event_count == 0 or non_event_count == 0:
                event_rate = 1.0 * (event_count +
                                    adjustment_factor) / event_total
                non_event_rate = 1.0 * (non_event_count +
                                        adjustment_factor) / non_event_total
            else:
                event_rate = 1.0 * event_count / event_total
                non_event_rate = 1.0 * non_event_count / non_event_total
            woe_i = math.log(event_rate / non_event_rate)

            event_count_array.append(event_count)
            non_event_count_array.append(non_event_count)
            event_rate_array.append(event_rate)
            non_event_rate_array.append(non_event_rate)
            woe_array.append(woe_i)
            iv_i = (event_rate - non_event_rate) * woe_i
            iv_array.append(iv_i)
            iv += iv_i
        return BinColResults(woe_array=woe_array,
                             iv_array=iv_array,
                             event_count_array=event_count_array,
                             non_event_count_array=non_event_count_array,
                             event_rate_array=event_rate_array,
                             non_event_rate_array=non_event_rate_array,
                             iv=iv)
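    # A minimal worked example (illustrative): with two bins counting
    # [(10, 40), (30, 20)], event_total = 40 and non_event_total = 60. For
    # the first bin, event_rate = 10 / 40 = 0.25 and
    # non_event_rate = 40 / 60 ≈ 0.667, so
    # woe = ln(0.25 / 0.667) ≈ -0.981 and the bin's IV contribution is
    # (0.25 - 0.667) * -0.981 ≈ 0.409; the column IV is the sum over bins.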

    def cal_iv_woe(self, result_counts, adjustment_factor):
        """
        Given event count information, calculate IV information

        Parameters
        ----------
        result_counts: dict or table.
            It is like:
                {'x1': [[event_count, non_event_count], [event_count, non_event_count] ... ],
                 'x2': [[event_count, non_event_count], [event_count, non_event_count] ... ],
                 ...
                }

        adjustment_factor : float
            The adjustment factor when calculating WOE

        Returns
        -------
        None. A BinColResults object is stored into self.bin_results for each
        column, conceptually:
            {'x1': col_result_obj,
             'x2': col_result_obj,
             ...
             }
        """
        if isinstance(result_counts, dict):
            for col_name, data_event_count in result_counts.items():
                col_result_obj = self.woe_1d(data_event_count,
                                             adjustment_factor)
                assert isinstance(col_result_obj, BinColResults)
                self.bin_results.put_col_results(col_name, col_result_obj)
        else:
            woe_1d = functools.partial(self.woe_1d,
                                       adjustment_factor=adjustment_factor)
            col_result_obj_dict = dict(
                result_counts.mapValues(woe_1d).collect())
            # col_result_obj = self.woe_1d(data_event_count, adjustment_factor)
            for col_name, col_result_obj in col_result_obj_dict.items():
                assert isinstance(col_result_obj, BinColResults)
                self.bin_results.put_col_results(col_name, col_result_obj)

    @staticmethod
    def add_label_in_partition(data_bin_with_table, sparse_bin_points):
        """
        Aggregate the labels per bin, so that WOE and IV become convenient to
        calculate.

        Parameters
        ----------
        data_bin_with_table : DTable
            The input data; each entry of the DTable is like:
            (id, ({'x1': 1, 'x2': 5, 'x3': 2}, y))

        sparse_bin_points : dict
            Dict of sparse bin nums, keyed by column name
                {'x1': 2, 'x2': 3, 'x3': 5 ... }

        Returns
        -------
        result_sum : list of (col_name, counts) pairs, like:
            [('x1', [[event_count, total_num], [event_count, total_num] ... ]),
             ('x2', [[event_count, total_num], [event_count, total_num] ... ]),
             ...
            ]

        """
        result_sum = {}
        for _, datas in data_bin_with_table:
            bin_idx_dict = datas[0]
            y = datas[1]

            # y = y_combo[0]
            # inverse_y = y_combo[1]
            for col_name, bin_idx in bin_idx_dict.items():
                result_sum.setdefault(col_name, [])
                col_sum = result_sum[col_name]
                while bin_idx >= len(col_sum):
                    col_sum.append([0, 0])
                if bin_idx == sparse_bin_points[col_name]:
                    continue
                label_sum = col_sum[bin_idx]
                label_sum[0] = label_sum[0] + y
                label_sum[1] = label_sum[1] + 1
                col_sum[bin_idx] = label_sum
                result_sum[col_name] = col_sum

        return list(result_sum.items())

    def shuffle_static_counts(self, statistic_counts):
        """
        Shuffle bin orders, and store the shuffled order in self.bin_results

        Parameters
        ----------
        statistic_counts :  dict.
            It is like:
                {'x1': [[event_count, non_event_count], [event_count, non_event_count] ... ],
                 'x2': [[event_count, non_event_count], [event_count, non_event_count] ... ],
                 ...
                }

        Returns
        -------
        Shuffled result
        """
        result = {}
        for col_name, count_sum in statistic_counts.items():
            this_bin_result = self.bin_results.all_cols_results.get(
                col_name, BinColResults())
            shuffled_index = list(range(len(count_sum)))
            random.shuffle(shuffled_index)
            result[col_name] = [count_sum[i] for i in shuffled_index]
            this_bin_result.bin_anonymous = [
                "bin_" + str(i) for i in shuffled_index
            ]
            self.bin_results.all_cols_results[col_name] = this_bin_result
        return result
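    # A minimal worked example (illustrative): shuffling
    # {'x1': [[1, 2], [3, 4], [5, 6]]} with shuffled_index [2, 0, 1] yields
    # {'x1': [[5, 6], [1, 2], [3, 4]]} and records
    # bin_anonymous = ['bin_2', 'bin_0', 'bin_1'], so the true bin order
    # stays recoverable from self.bin_results.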

    @staticmethod
    def aggregate_partition_label(sum1, sum2):
        """
        Used in the reduce function. Aggregates the results calculated from each partition.

        Parameters
        ----------
        sum1 :  list.
            It is like:
                [[event_count, total_num], [event_count, total_num] ... ]
        sum2 : list
            Same as sum1
        Returns
        -------
        Merged sum. The format is same as sum1.

        """
        if sum1 is None and sum2 is None:
            return None

        if sum1 is None:
            return sum2

        if sum2 is None:
            return sum1

        for idx, label_sum2 in enumerate(sum2):
            if idx >= len(sum1):
                sum1.append(label_sum2)
            else:
                label_sum1 = sum1[idx]
                tmp = [
                    label_sum1[0] + label_sum2[0],
                    label_sum1[1] + label_sum2[1]
                ]
                sum1[idx] = tmp

        return sum1
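    # A minimal worked example (illustrative): partial sums of different
    # lengths merge element-wise, and the tail of the longer one is kept.
    #
    #     sum1 = [[3, 10], [5, 20]]
    #     sum2 = [[1, 5], [2, 10], [4, 8]]
    #     BaseBinning.aggregate_partition_label(sum1, sum2)
    #     # -> [[4, 15], [7, 30], [4, 8]]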
Example #4
class BaseFeatureBinning(ModelBase):
    """
    Apply the binning method across guest and host parties

    """
    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            # self.binning_obj = QuantileBinning(self.bin_param)
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        transform data into sparse format
        """

        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row
        else:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = []
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    replace_key.append(key)

            if len(replace_key) == 0:
                return row
            else:
                new_row = copy.deepcopy(row)
                new_sparse_vec = new_row.features.get_sparse_vector()
                for key in replace_key:
                    new_sparse_vec[key] = NoneType()
                return new_row
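    # A minimal worked example (illustrative): a dense row with features
    # [0.0, 1.5, nan] becomes a SparseVector with indices [1, 2] and data
    # [1.5, NoneType()]: exact zeros are dropped, and NaNs are replaced by
    # the NoneType missing-value marker.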

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(self.header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances,
                                                    self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        # col_list = [str(x) for x in self.cols]

        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        # binning_result_obj = self.bin_results.generated_pb()
        host_results = [
            x.bin_results.generated_pb() for x in self.host_results
        ]
        result_obj = feature_binning_param_pb2. \
            FeatureBinningParam(binning_result=binning_result_obj,
                                host_results=host_results,
                                header=self.header,
                                header_anonymous=self.header_anonymous,
                                model_name=consts.BINNING_MODEL)

        return result_obj

    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = BaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug(
            "After Binning, when setting schema, schema is : {}".format(
                data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)
Example #5
class BaseFeatureBinning(ModelBase):
    """
    Apply the binning method across guest and host parties

    """
    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()
        self.bin_result = MultiClassBinResult(labels=[0, 1])
        self.has_missing_value = False
        self.labels = []

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party does not support woe transform for now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))

        self.iv_calculator = IvCalculator(
            self.model_param.adjustment_factor,
            role=self.role,
            party_id=self.component_properties.local_partyid)
        # self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        transform data into sparse format
        """

        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row
        else:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = []
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    replace_key.append(key)

            if len(replace_key) == 0:
                return row
            else:
                new_row = copy.deepcopy(row)
                new_sparse_vec = new_row.features.get_sparse_vector()
                for key in replace_key:
                    new_sparse_vec[key] = NoneType()
                return new_row

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(self.header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
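        # WOE transform needs label-derived statistics, so it is routed
        # through the IvCalculator and rejected on host parties; every other
        # transform type is handled by the binning object directly.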
        if self.transform_type != "woe":
            data_instances = self.binning_obj.transform(
                data_instances, self.transform_type)
        elif self.role == consts.HOST:
            raise ValueError(
                "Woe transform is not available for host parties.")
        else:
            data_instances = self.iv_calculator.woe_transformer(
                data_instances, self.bin_inner_param, self.bin_result)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        # col_list = [str(x) for x in self.cols]

        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
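        # Build both the legacy single-label pb (binning_result, taken from
        # the first multi-class entry) and the full multi-class pb that
        # carries per-label and per-host results.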
        split_points_result = self.binning_obj.bin_results.split_results

        multi_class_result = self.bin_result.generated_pb_list(
            split_points_result)
        # LOGGER.debug(f"split_points_result: {split_points_result}")
        host_multi_class_result = []
        host_single_results = []
        for host_res in self.host_results:
            host_multi_class_result.extend(host_res.generated_pb_list())
            host_single_results.append(host_res.bin_results[0].generated_pb())

        has_host_result = len(host_multi_class_result) > 0
        multi_pb = feature_binning_param_pb2.MultiClassResult(
            results=multi_class_result,
            labels=[str(x) for x in self.labels],
            host_results=host_multi_class_result,
            host_party_ids=[
                str(x) for x in self.component_properties.host_party_idlist
            ],
            has_host_result=has_host_result)
        result_obj = feature_binning_param_pb2. \
            FeatureBinningParam(binning_result=multi_class_result[0],
                                host_results=host_single_results,
                                header=self.header,
                                header_anonymous=self.header_anonymous,
                                model_name=consts.BINNING_MODEL,
                                multi_class_result=multi_pb)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        multi_class_result = model_param.multi_class_result
        self.labels = list(multi_class_result.labels)
        # if not self.labels:
        #     self.labels = [0, 1]
        if self.labels:
            self.bin_result = MultiClassBinResult.reconstruct(
                list(multi_class_result.results), self.labels)

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        elif bin_method == consts.OPTIMAL:
            self.binning_obj = OptimalBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        # self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

        split_results = dict(model_param.binning_result.binning_result)
        for col_name, sr_pb in split_results.items():
            split_points = list(sr_pb.split_points)
            self.binning_obj.bin_results.put_col_split_points(
                col_name, split_points)

        # self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        host_pbs = list(model_param.multi_class_result.host_results)
        if host_pbs:
            if len(self.labels) == 2:
                for host_pb in host_pbs:
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(host_pb, self.labels))
            else:
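                # Multi-class case: the flat host pb list holds one entry per
                # label for each host, so it is consumed in chunks of
                # len(self.labels).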
                assert len(host_pbs) % len(self.labels) == 0
                i = 0
                while i < len(host_pbs):
                    this_pbs = host_pbs[i:i + len(self.labels)]
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(this_pbs, self.labels))
                    i += len(self.labels)

        if list(model_param.header_anonymous):
            self.header_anonymous = list(model_param.header_anonymous)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        # LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)
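
The dense-to-sparse conversion above is the core of data_format_transform. Below is a minimal, self-contained sketch of the same logic; MISSING, FLOAT_ZERO, and dense_to_sparse are plain-Python stand-ins introduced for illustration, not the real FATE classes (which use NoneType(), consts.FLOAT_ZERO, and SparseVector).

import numpy as np

MISSING = object()  # stand-in for FATE's NoneType() missing-value marker
FLOAT_ZERO = 1e-8   # stand-in for consts.FLOAT_ZERO

def dense_to_sparse(features):
    """Keep non-zero entries; encode NaN as the missing-value marker."""
    indices, data = [], []
    for i, value in enumerate(features):
        if np.isnan(value):
            indices.append(i)
            data.append(MISSING)
        elif abs(value) < FLOAT_ZERO:
            continue  # exact and near-zero values are dropped from the sparse form
        else:
            indices.append(i)
            data.append(value)
    return indices, data, len(features)

print(dense_to_sparse(np.array([0.0, 1.5, np.nan])))
# -> ([1, 2], [1.5, <MISSING object>], 3)
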
Example #6
class BaseHeteroFeatureBinning(ModelBase):
    """
    Do binning method through guest and host

    """

    def __init__(self):
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj = None
        self.header = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        else:
            # self.binning_obj = QuantileBinning(self.bin_param)
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(self.role, self.component_properties))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

    def _setup_bin_inner_param(self, data_instances, params: FeatureBinningParam):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)

        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances, self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        # col_list = [str(x) for x in self.cols]

        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type
        )

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param
        )
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        host_results = [x.bin_results.generated_pb() for x in self.host_results]

        result_obj = feature_binning_param_pb2.FeatureBinningParam(binning_result=binning_result_obj,
                                                                   host_results=host_results,
                                                                   header=self.header)
        # json_result = json_format.MessageToJson(result_obj)
        # LOGGER.debug("json_result: {}".format(json_result))
        return result_obj

    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
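            # Anything other than quantile is loaded as bucket binning; this
            # variant of the class does not support optimal binning.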
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = HostBaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {
            MODEL_META_NAME: meta_obj,
            MODEL_PARAM_NAME: param_obj
        }
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
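
Both classes share the same column-selection convention in _setup_bin_inner_param: bin_indexes == -1 is a sentinel meaning "bin every column"; otherwise explicit indexes and names are combined. The sketch below illustrates that convention; select_bin_cols is a hypothetical helper written for this example, not part of FATE.

def select_bin_cols(header, bin_indexes, bin_names):
    # Sentinel -1 selects every column in the header.
    if bin_indexes == -1:
        return list(header)
    chosen = [header[i] for i in bin_indexes]
    # Names are merged in after indexes, skipping duplicates.
    chosen += [name for name in bin_names if name not in chosen]
    return chosen

print(select_bin_cols(["x1", "x2", "x3"], -1, []))       # ['x1', 'x2', 'x3']
print(select_bin_cols(["x1", "x2", "x3"], [0], ["x3"]))  # ['x1', 'x3']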