def fit(self, data_instances):
    """
    Fit split points locally, then compute IV/WOE for the guest and each host.

    Steps, in the order they are executed:

    1. Run abnormal detection, set up the inner binning parameters and fit
       split points on ``data_instances``.
    2. If ``model_param.skip_static`` is set, transform and return without
       computing any statistics.
    3. Otherwise compute the local IV. If ``model_param.local_only`` is set,
       transform, publish the local summary and return.
    4. In the federated case, send the Paillier-encrypted labels to every
       host, receive each host's encrypted bin sums and binning info, and
       compute IV/WOE per host. Hosts that report the optimal binning method
       are re-binned through ``optimal_binning_sync``.

    Parameters
    ----------
    data_instances : table
        Input table. The code relies on it supporting ``mapValues``,
        ``count`` and ``partitions``; its values are passed through
        ``self.load_data`` and are expected to expose a ``label`` attribute
        afterwards.

    Returns
    -------
    ``self.data_output`` as it stands after ``self.transform`` was called
    (presumably populated by ``transform`` -- confirm against that method).

    Raises
    ------
    ValueError
        If ``data_overview.get_label_count`` reports more than two entries;
        IV is supported for binary labels only.
    NotImplementedError
        If ``model_param.encrypt_param.method`` is not ``consts.PAILLIER``.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)

    self._setup_bin_inner_param(data_instances, self.model_param)

    self.binning_obj.fit_split_points(data_instances)
    if self.model_param.skip_static:
        # Only split points are wanted: skip IV/WOE and all host exchange.
        self.transform(data_instances)
        return self.data_output

    label_counts = data_overview.get_label_count(data_instances)
    if len(label_counts) > 2:
        raise ValueError("Iv calculation support binary-data only in this version.")

    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    label_table = data_instances.mapValues(lambda x: x.label)

    if self.model_param.local_only:
        LOGGER.info("This is a local only binning fit")
        self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                      label_counts=label_counts)
        self.transform(data_instances)
        self.set_summary(self.binning_obj.bin_results.summary())
        return self.data_output

    if self.model_param.encrypt_param.method == consts.PAILLIER:
        cipher = PaillierEncrypt()
        cipher.generate_key(self.model_param.encrypt_param.key_length)
    else:
        raise NotImplementedError("encrypt method not supported yet")

    f = functools.partial(self.encrypt, cipher=cipher)
    encrypted_label_table = label_table.mapValues(f)

    # Labels are sent before the local IV is computed, so the local work is
    # already dispatched to the hosts' side by the time we block on `get`.
    self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                  role=consts.HOST,
                                                  idx=-1)
    LOGGER.info("Sent encrypted_label_table to host")

    self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                  label_counts=label_counts)

    # Both results are lists indexed by host position (idx=-1 -> all hosts).
    encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
    encrypted_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
    total_summary = self.binning_obj.bin_results.summary()

    LOGGER.info("Get encrypted_bin_sum from host")
    for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
        host_party_id = self.component_properties.host_party_idlist[host_idx]
        encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
        result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)

        host_bin_methods = encrypted_bin_info['bin_method']
        category_names = encrypted_bin_info['category_names']
        if host_bin_methods == consts.OPTIMAL:
            # Rebuild the host's optimal-binning configuration on a copy so
            # the guest's own model_param is left untouched.
            optimal_binning_params = encrypted_bin_info['optimal_params']

            host_model_params = copy.deepcopy(self.model_param)
            host_model_params.bin_num = optimal_binning_params.get('bin_num')
            host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get('metric_method')
            host_model_params.optimal_binning_param.mixture = optimal_binning_params.get('mixture')
            host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get('max_bin_pct')
            host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get('min_bin_pct')

            self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(data_instances)
            result_counts = dict(result_counts.collect())
            # Category columns bypass optimal binning and are scored as-is.
            optimal_binning_cols = {x: y for x, y in result_counts.items() if x not in category_names}
            host_binning_obj = self.optimal_binning_sync(optimal_binning_cols, data_instances.count(),
                                                         data_instances.partitions,
                                                         host_idx,
                                                         host_model_params)
            category_bins = {x: y for x, y in result_counts.items() if x in category_names}
            host_binning_obj.cal_iv_woe(category_bins, self.model_param.adjustment_factor)
        else:
            host_binning_obj = BaseBinning()
            host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
        host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
        total_summary = self._merge_summary(total_summary,
                                            host_binning_obj.bin_results.summary())
        self.host_results.append(host_binning_obj)

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    # Publish only the merged guest/host results; no placeholder entries.
    self.set_summary(total_summary)
    return self.data_output
def fit(self, data_instances):
    """
    Bin the local features and evaluate IV/WOE for this party and all hosts.

    The method fits split points on ``data_instances`` and computes the
    local IV. Unless ``model_param.local_only`` is set, it then sends the
    Paillier-encrypted labels to the hosts, collects their encrypted bin
    sums and computes IV/WOE for each host. A host whose reported bin
    method is ``consts.OPTIMAL`` is re-binned via ``optimal_binning_sync``
    using the parameters that host sent; its category columns are scored
    directly.

    Parameters
    ----------
    data_instances : table
        Input table. The code relies on ``mapValues``, ``count`` and the
        ``_partitions`` attribute; values are passed through
        ``self.load_data`` and must expose ``label`` afterwards.

    Returns
    -------
    ``self.data_output`` as it stands after ``self.transform`` was called.

    Raises
    ------
    ValueError
        If ``data_overview.count_labels`` returns a value greater than 2.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    self._setup_bin_inner_param(data_instances, self.model_param)
    self.binning_obj.fit_split_points(data_instances)

    n_labels = data_overview.count_labels(data_instances)
    if n_labels > 2:
        raise ValueError(
            "Iv calculation support binary-data only in this version.")

    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    labels = data_instances.mapValues(lambda inst: inst.label)

    # Local-only mode: no encryption and no exchange with the hosts.
    if self.model_param.local_only:
        LOGGER.info("This is a local only binning fit")
        self.binning_obj.cal_local_iv(data_instances, label_table=labels)
        self.transform(data_instances)
        return self.data_output

    paillier = PaillierEncrypt()
    paillier.generate_key()
    encrypt_label = functools.partial(self.encrypt, cipher=paillier)
    encrypted_labels = labels.mapValues(encrypt_label)

    self.transfer_variable.encrypted_label.remote(
        encrypted_labels, role=consts.HOST, idx=-1)
    LOGGER.info("Sent encrypted_label_table to host")

    # Local IV is computed after the labels are sent and before blocking on
    # the hosts' replies.
    self.binning_obj.cal_local_iv(data_instances, label_table=labels)

    host_payloads = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
    LOGGER.info("Get encrypted_bin_sum from host")

    for host_idx, payload in enumerate(host_payloads):
        host_party_id = self.component_properties.host_party_idlist[host_idx]

        cipher_sums = payload['encrypted_bin_sum']
        bin_method = payload['bin_method']
        categorical = payload['category_names']

        bucket_counts = self.__decrypt_bin_sum(cipher_sums, paillier)
        LOGGER.debug(
            "Received host {} result, length of buckets: {}".format(
                host_idx, len(bucket_counts)))
        LOGGER.debug(
            "category_name: {}, host_bin_methods: {}".format(
                categorical, bin_method))

        if bin_method == consts.OPTIMAL:
            # Apply the host's optimal-binning settings to a private copy of
            # the model parameters.
            host_opts = payload['optimal_params']
            host_params = copy.deepcopy(self.model_param)
            host_params.bin_num = host_opts.get('bin_num')
            for field in ('metric_method', 'mixture',
                          'max_bin_pct', 'min_bin_pct'):
                setattr(host_params.optimal_binning_param, field,
                        host_opts.get(field))

            event_total, non_event_total = self.get_histogram(data_instances)
            self.binning_obj.event_total = event_total
            self.binning_obj.non_event_total = non_event_total

            # Non-category columns go through optimal binning ...
            non_category_counts = {
                col: counts
                for col, counts in bucket_counts.items()
                if col not in categorical
            }
            host_binning = self.optimal_binning_sync(
                non_category_counts,
                data_instances.count(),
                data_instances._partitions,
                host_idx,
                host_params)

            # ... while category columns are scored with their given bins.
            category_counts = {
                col: counts
                for col, counts in bucket_counts.items()
                if col in categorical
            }
            host_binning.cal_iv_woe(category_counts,
                                    self.model_param.adjustment_factor)
        else:
            host_binning = BaseBinning()
            host_binning.cal_iv_woe(bucket_counts,
                                    self.model_param.adjustment_factor)

        host_binning.set_role_party(role=consts.HOST, party_id=host_party_id)
        self.host_results.append(host_binning)

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    return self.data_output