예제 #1
0
    def update_label_encoder(self, data):
        """
        Make sure ``self.label_encoder`` exists and record the types of its keys and values.

        When no encoder was supplied, one is built from ``data``: the labels are
        collected (from the label count when the schema has no "content_type",
        otherwise from the predict-result labels), sorted, and mapped to their
        position in the sorted order.

        Afterwards ``self.encoder_key_type`` / ``self.encoder_value_type`` hold the
        type names keyed by the stringified encoder key, and every encoder key is
        passed through ``load_value_to_type`` with its recorded type name.

        Parameters
        ----------
        data : table-like object exposing ``schema``; only read when no encoder
            was provided.
        """
        if self.label_encoder is None:
            # No encoder supplied: derive the label set from the data itself.
            if data.schema.get("content_type") is None:
                labels = sorted(get_label_count(data).keys())
            else:
                # predict result
                labels = sorted(get_predict_result_labels(data))
            self.label_encoder = {label: index for index, label in enumerate(labels)}
        else:
            LOGGER.info("label encoder provided")
            if self.label_list is not None:
                LOGGER.info("label list provided")
                # The label list decides the key types when it is available.
                key_types = {}
                for label in self.label_list:
                    key_types[str(label)] = type(label).__name__
                self.encoder_key_type = key_types

        if self.encoder_key_type is None:
            # Fall back to the types of the encoder's own keys.
            self.encoder_key_type = {
                str(key): type(key).__name__ for key in self.label_encoder.keys()
            }

        value_types = {}
        for key, value in self.label_encoder.items():
            value_types[str(key)] = type(value).__name__
        self.encoder_value_type = value_types

        # Rebuild the encoder with each key converted according to its recorded type name.
        typed_encoder = {}
        for key, value in self.label_encoder.items():
            typed_key = load_value_to_type(key, self.encoder_key_type[str(key)])
            typed_encoder[typed_key] = value
        self.label_encoder = typed_encoder
예제 #2
0
    def get_class_weight(data_instances):
        """
        Compute a weight per label as ``n_samples / (n_classes * label_count)``.

        Parameters
        ----------
        data_instances : table-like object accepted by ``get_label_count`` and
            exposing ``count()``.

        Returns
        -------
        dict
            Maps ``str(label)`` to the weight of that label.
        """
        count_by_label = get_label_count(data_instances)
        total = data_instances.count()
        class_num = len(count_by_label)

        weights = {}
        for label, occurrences in count_by_label.items():
            weights[str(label)] = total / (class_num * occurrences)
        return weights
예제 #3
0
    def test_bucket_binning(self):
        """
        Check the split points and the local IV produced by ``BucketBinning``.

        The first column's k-th split point (0-based) is expected to equal
        ``(data_num - 1) / bin_num * (k + 1)``, and every column's IV is expected
        to be within 1e-6 of the reference value.
        """
        bin_param = FeatureBinningParam(bin_num=self.bin_num,
                                        bin_indexes=self.cols)
        bucket_bin = BucketBinning(bin_param)
        split_points = bucket_bin.fit_split_points(self.table)

        # Only the first column's split points are checked.
        split_point = list(split_points.values())[0]
        for kth, s_p in enumerate(split_point):
            expect_s_p = (self.data_num - 1) / self.bin_num * (kth + 1)
            self.assertEqual(s_p, expect_s_p)

        label_counts = data_overview.get_label_count(self.table)
        bucket_bin.cal_local_iv(self.table, label_counts=label_counts)

        expected_iv = 0.00364386529386804
        for col_name, iv_attr in bucket_bin.bin_results.all_cols_results.items():
            # Use a TestCase assertion instead of a bare ``assert``: it is not
            # stripped under ``python -O`` and reports the offending column.
            self.assertLess(
                abs(iv_attr.iv - expected_iv), 1e-6,
                msg='col_name: {}, iv: {}'.format(col_name, iv_attr.iv))
예제 #4
0
    def cal_local_iv(self, data_instances, split_points,
                     labels=None, label_counts=None, bin_cols_map=None,
                     label_table=None):
        """
        Compute the local binning result for ``data_instances`` with the given split points.

        Parameters
        ----------
        data_instances : table-like object whose ``schema`` holds a "header" entry.
        split_points : mapping of column name to that column's split points.
        labels : label values. Replaced by the keys of the computed label count
            when ``label_counts`` is None; otherwise used as passed in.
        label_counts : per-label counts. When None they are obtained from
            ``data_overview.get_label_count`` and ordered like ``labels``.
        bin_cols_map : mapping of column name to column index. Defaults to every
            header column mapped to its position.
        label_table : label table; built with ``self.convert_label`` when None.

        Returns
        -------
        The result object produced by ``self.cal_iv_from_counts`` (a
        MultiClassBinResult object according to the original documentation),
        with the split points of every column in ``split_points`` registered on it.
        """
        header = data_instances.schema.get("header")

        if bin_cols_map is not None:
            # Keep header order; only columns present in the map are considered.
            bin_indexes = [bin_cols_map[name] for name in header if name in bin_cols_map]
        else:
            bin_cols_map = {name: position for position, name in enumerate(header)}
            bin_indexes = list(range(len(header)))

        if label_counts is None:
            count_by_label = data_overview.get_label_count(data_instances)
            labels = list(count_by_label.keys())
            label_counts = [count_by_label[label] for label in labels]

        data_bin_table = BaseBinning.get_data_bin(data_instances, split_points, bin_cols_map)
        index_to_sparse_point = BaseBinning.get_sparse_bin(bin_indexes, split_points, header)
        # Re-key the sparse bin points from column index to column name.
        sparse_bin_points = {}
        for col_index, sparse_point in index_to_sparse_point.items():
            sparse_bin_points[header[col_index]] = sparse_point

        if label_table is None:
            label_table = self.convert_label(data_instances, labels)

        result_counts = self.cal_bin_label(data_bin_table, sparse_bin_points, label_table, label_counts)
        multi_bin_res = self.cal_iv_from_counts(result_counts, labels,
                                                role=self.role,
                                                party_id=self.party_id)
        for col_name, col_split_points in split_points.items():
            multi_bin_res.put_col_split_points(col_name, col_split_points)
        return multi_bin_res
예제 #5
0
    def fit(self, data_instances):
        """
        Fit split points, compute the local IV and merge in the results of every host.

        Steps:
          1. Fit split points with ``self.binning_obj``.
          2. If ``skip_static`` is set, transform and return immediately.
          3. Compute the local IV; if ``local_only`` is set, publish the local
             summary and return.
          4. Otherwise encrypt the labels, send them to the hosts, receive each
             host's encrypted bin sums, decrypt them and compute the host IV/WOE.
          5. Transform the data and publish the merged summary.

        IV is supported for binary-labelled data only.

        Parameters
        ----------
        data_instances : table of data instances to fit on.

        Returns
        -------
        ``self.data_output`` after ``self.transform`` has been called.

        Raises
        ------
        ValueError
            If the data holds more than two distinct labels.
        NotImplementedError
            If the configured encrypt method is not Paillier.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)
        if self.model_param.skip_static:
            # No statistics requested: transform with the fitted split points and stop.
            self.transform(data_instances)
            return self.data_output

        label_counts = data_overview.get_label_count(data_instances)
        if len(label_counts) > 2:
            raise ValueError("Iv calculation support binary-data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                          label_counts=label_counts)
            self.transform(data_instances)
            self.set_summary(self.binning_obj.bin_results.summary())
            return self.data_output

        if self.model_param.encrypt_param.method == consts.PAILLIER:
            cipher = PaillierEncrypt()
            cipher.generate_key(self.model_param.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

        encrypt_label = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(encrypt_label)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        # The local IV is computed after the labels are sent and before the
        # host results are fetched.
        self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                      label_counts=label_counts)

        encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
        encrypted_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
        total_summary = self.binning_obj.bin_results.summary()

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[host_idx]
            encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
            result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)

            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            if host_bin_methods == consts.OPTIMAL:
                optimal_binning_params = encrypted_bin_info['optimal_params']

                # Work on a copy so the host's settings never leak into the guest's own parameters.
                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get('bin_num')
                host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get('metric_method')
                host_model_params.optimal_binning_param.mixture = optimal_binning_params.get('mixture')
                host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get('max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get('min_bin_pct')

                self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(data_instances)
                result_counts = dict(result_counts.collect())
                # Category columns are excluded from the optimal-binning sync and
                # get their IV/WOE computed directly from the decrypted counts.
                optimal_binning_cols = {x: y for x, y in result_counts.items() if x not in category_names}
                host_binning_obj = self.optimal_binning_sync(optimal_binning_cols, data_instances.count(),
                                                             data_instances.partitions,
                                                             host_idx, host_model_params)
                category_bins = {x: y for x, y in result_counts.items() if x in category_names}
                host_binning_obj.cal_iv_woe(category_bins, self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
            total_summary = self._merge_summary(total_summary,
                                                host_binning_obj.bin_results.summary())
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        # The summary holds only the merged guest/host results; the debugging
        # placeholder entry ('test': 'test') is no longer added.
        self.set_summary(total_summary)
        return self.data_output
예제 #6
0
 def statistic_label(data_instances):
     """
     Return the distinct labels and their counts as two aligned lists.

     Returns
     -------
     tuple of (list, list)
         ``label_elements`` and ``label_counts``, where ``label_counts[i]`` is the
         count reported by ``data_overview.get_label_count`` for ``label_elements[i]``.
     """
     count_by_label = data_overview.get_label_count(data_instances)
     label_elements = []
     label_counts = []
     for label in count_by_label.keys():
         label_elements.append(label)
         label_counts.append(count_by_label[label])
     return label_elements, label_counts
예제 #7
0
    def fit(self, data_instances):
        """
        Fit split points, compute the local IV and, unless running local-only,
        run the federated IV step and merge the host summaries.

        Parameters
        ----------
        data_instances : table of data instances to fit on.

        Returns
        -------
        ``self.data_output`` after ``self.transform`` has been called.

        Raises
        ------
        ValueError
            With optimal binning, if a column to be binned contains missing
            values or the data holds more than two distinct labels.
        NotImplementedError
            If the configured encrypt method is not Paillier.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        if self.model_param.method == consts.OPTIMAL:
            cols_with_missing = self.iv_calculator.check_containing_missing_value(data_instances)
            if any(idx in cols_with_missing for idx in self.bin_inner_param.bin_indexes):
                raise ValueError("Optimal Binning do not support missing value now.")
        split_points = self.binning_obj.fit_split_points(data_instances)

        if self.model_param.skip_static:
            # No statistics requested: transform with the fitted split points and stop.
            self.transform(data_instances)
            return self.data_output

        count_by_label = data_overview.get_label_count(data_instances)

        # Multi-class data is only rejected for optimal binning.
        if len(count_by_label) > 2 and self.model_param.method == consts.OPTIMAL:
            raise ValueError("Have not supported optimal binning in multi-class data yet")

        self.labels = list(count_by_label.keys())
        label_counts = [count_by_label[label] for label in self.labels]
        label_table = IvCalculator.convert_label(data_instances, self.labels)
        iv_cols_map = self.bin_inner_param.get_need_cal_iv_cols_map()
        self.bin_result = self.iv_calculator.cal_local_iv(
            data_instances=data_instances,
            split_points=split_points,
            labels=self.labels,
            label_counts=label_counts,
            bin_cols_map=iv_cols_map,
            label_table=label_table,
        )

        if self.model_param.local_only:
            self.transform(data_instances)
            self.set_summary(self.bin_result.summary())
            return self.data_output

        encrypt_param = self.model_param.encrypt_param
        if encrypt_param.method != consts.PAILLIER:
            raise NotImplementedError("encrypt method not supported yet")
        encryptor = PaillierEncrypt()
        encryptor.generate_key(self.model_param.encrypt_param.key_length)
        cipher = EncryptModeCalculator(encrypter=encryptor)

        self._packer = GuestIntegerPacker(
            pack_num=len(self.labels),
            pack_num_range=label_counts,
            encrypt_mode_calculator=cipher,
        )

        self.federated_iv(
            data_instances=data_instances,
            label_table=label_table,
            cipher=cipher,
            result_counts=count_by_label,
            label_elements=self.labels,
        )

        # Start from the local summary and fold in every host's summary.
        merged_summary = self.bin_result.summary()
        for host_result in self.host_results:
            merged_summary = self._merge_summary(merged_summary, host_result.summary())

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        self.set_summary(merged_summary)
        return self.data_output