class HeteroBinningGuestWorkflow(WorkFlow):
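    # Guest-side binning workflow: parses a FeatureBinningParam from the runtime
    # config, builds a HeteroFeatureBinningGuest model, then fits or transforms
    # depending on binning_param.process_method.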
    def _initialize(self, config_path):
        self._initialize_role_and_mode()
        self._initialize_model(config_path)
        self._initialize_workflow_param(config_path)

    def _initialize_role_and_mode(self):
        self.role = consts.GUEST
        self.mode = consts.HETERO

    def _initialize_intersect(self, config):
        pass

    def _initialize_model(self, runtime_conf_path):
        binning_param = FeatureBinningParam()
        self.binning_param = ParamExtract.parse_param_from_config(
            binning_param, runtime_conf_path)
        FeatureBinningParamChecker.check_param(self.binning_param)
        self.model = HeteroFeatureBinningGuest(self.binning_param)
        LOGGER.debug("Guest model started")

    def save_binning_result(self):

        meta_table = self.model.save_model(self.workflow_param.model_table,
                                           self.workflow_param.model_namespace)
        return meta_table

    @status_tracer_decorator.status_trace
    def run(self):
        self._init_argument()

        if self.workflow_param.method == "binning":
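            # process_method 'fit' learns split points from the training input;
            # anything else loads a previously saved model and only transforms.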

            if self.binning_param.process_method == 'fit':
                train_data_instance = self.gen_data_instance(
                    self.workflow_param.train_input_table,
                    self.workflow_param.train_input_namespace,
                    mode='fit')
                if self.binning_param.local_only:
                    self.model.fit_local(train_data_instance)
                else:
                    self.model.fit(train_data_instance)
                self.save_binning_result()
            else:
                train_data_instance = self.gen_data_instance(
                    self.workflow_param.train_input_table,
                    self.workflow_param.train_input_namespace,
                    mode='transform')
                self.load_model()

                if self.binning_param.local_only:
                    self.model.transform_local(train_data_instance)
                else:
                    self.model.transform(train_data_instance)
                self.save_binning_result()
        else:
            raise TypeError("method %s is not supported yet" %
                            (self.workflow_param.method))

        LOGGER.info("Task end")
Example #3
    def run_data(self, table_args, run_type='fit'):
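        # Reuse the cached binning object if one exists; otherwise build a
        # Guest or Host instance to match this party's role and run it.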
        if self.binning_obj is not None:
            return self.binning_obj
        if self.role == GUEST:
            binning_obj = HeteroFeatureBinningGuest()
        else:
            binning_obj = HeteroFeatureBinningHost()
        guest_param = self._make_param_dict(run_type)

        binning_obj.run(guest_param, table_args)
        self.binning_obj = binning_obj
        return binning_obj
Example #4
    def init_previous_model(self, **models):
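        # Rebuild a binning model saved under (name, namespace) in the
        # 'binning_model' entry and keep it as self.binning_model.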
        if 'binning_model' in models:
            binning_model_params = models.get('binning_model')
            binning_param = FeatureBinningParam()
            if self.party_name == consts.GUEST:
                binning_obj = HeteroFeatureBinningGuest(binning_param)
            else:
                binning_obj = HeteroFeatureBinningHost(binning_param)

            name = binning_model_params.get('name')
            namespace = binning_model_params.get('namespace')

            binning_obj.load_model(name, namespace)
            self.binning_model = binning_obj
    def run_data(self, table_args, run_type='fit'):
        if self.binning_obj is not None:
            return self.binning_obj
        if self.role == GUEST:
            binning_obj = HeteroFeatureBinningGuest()
        else:
            binning_obj = HeteroFeatureBinningHost()

        # param_obj = FeatureBinningParam(method=consts.QUANTILE)
        # binning_obj.model_param = param_obj
        guest_param = self._make_param_dict(run_type)
        binning_obj.run(guest_param, table_args)
        print("current binning method: {}, split_points: {}".format(
            binning_obj.model_param.method,
            binning_obj.binning_obj.split_points))
        self.binning_obj = binning_obj
        return binning_obj
    def load_model(self, model_dict):
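        # Two inputs can arrive here: a trained selection model under 'model',
        # and an upstream binning model under 'isometric_model' that is
        # reloaded into self.binning_model.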

        if 'model' in model_dict:
            # self._parse_need_run(model_dict, MODEL_META_NAME)
            LOGGER.debug("Feature selection need run: {}".format(
                self.need_run))
            if not self.need_run:
                return
            model_param = list(
                model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
            model_meta = list(
                model_dict.get('model').values())[0].get(MODEL_META_NAME)

            self.model_output = {
                MODEL_META_NAME: model_meta,
                MODEL_PARAM_NAME: model_param
            }

            header = list(model_param.header)
            # self.schema = {'header': header}
            self.header = header
            self.curt_select_properties.set_header(header)
            self.completed_selection_result.set_header(header)
            self.curt_select_properties.set_last_left_col_indexes(
                [x for x in range(len(header))])
            self.curt_select_properties.add_select_col_names(header)

            final_left_cols_names = dict(model_param.final_left_cols.left_cols)
            LOGGER.debug(
                "final_left_cols_names: {}".format(final_left_cols_names))
            for col_name, _ in final_left_cols_names.items():
                self.curt_select_properties.add_left_col_name(col_name)
            self.completed_selection_result.add_filter_results(
                filter_name='conclusion',
                select_properties=self.curt_select_properties)
            self.update_curt_select_param()
            LOGGER.debug(
                "After load model, completed_selection_result.all_left_col_indexes: {}"
                .format(self.completed_selection_result.all_left_col_indexes))

        if 'isometric_model' in model_dict:

            LOGGER.debug(
                "Has isometric_model, model_dict: {}".format(model_dict))
            if self.party_name == consts.GUEST:
                self.binning_model = HeteroFeatureBinningGuest()
            else:
                self.binning_model = HeteroFeatureBinningHost()

            new_model_dict = {'model': model_dict['isometric_model']}
            self.binning_model.load_model(new_model_dict)
Example #7
    def _calculates_iv_attrs(self, data_instances, flowid_postfix=''):
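        # Run (or reset and re-run) hetero feature binning to obtain IV
        # statistics for the currently remaining columns; in federated mode the
        # result also carries the host side's IV values.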
        if self.local_only and self.guest_iv_attrs is not None:
            return

        bin_flow_id = self.flowid + flowid_postfix
        self.bin_param.cols = self.left_cols
        if self.binning_model is None:
            self.binning_model = HeteroFeatureBinningGuest(self.bin_param)
            self.binning_model.set_flowid(bin_flow_id)
        else:
            self.binning_model.reset(self.bin_param, flowid=bin_flow_id)

        if self.local_only:
            if self.guest_iv_attrs is None:
                self.guest_iv_attrs = self.binning_model.fit_local(
                    data_instances=data_instances)
        else:
            iv_attrs = self.binning_model.fit(data_instances)
            self.guest_iv_attrs = iv_attrs.get('local')
            self.host_iv_attrs = iv_attrs.get('remote')
            self.host_left_cols = [i for i in range(len(self.host_iv_attrs))]
            LOGGER.debug("Host left cols: {}".format(self.host_left_cols))
        LOGGER.info("Finish federated binning with host.")
Example #8
    def feature_binning(self, data_instances, flow_id='sample_flowid'):
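        # Build a FeatureBinningParam from the workflow config, run binning for
        # this party's role, and persist the resulting model into the pipeline.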
        if self.mode == consts.HOMO:
            LOGGER.info(
                "Homo feature binning is not supported yet. Coming soon")
            return data_instances

        if data_instances is None:
            return data_instances

        LOGGER.info("Start feature binning")
        feature_binning_param = param_generator.FeatureBinningParam()
        feature_binning_param = ParamExtract.parse_param_from_config(
            feature_binning_param, self.config_path)
        param_checker.FeatureBinningParamChecker.check_param(
            feature_binning_param)

        if self.role == consts.HOST:
            feature_binning_obj = HeteroFeatureBinningHost(
                feature_binning_param)
        elif self.role == consts.GUEST:
            feature_binning_obj = HeteroFeatureBinningGuest(
                feature_binning_param)
        elif self.role == consts.ARBITER:
            return data_instances
        else:
            raise ValueError("Unknown role of workflow")

        feature_binning_obj.set_flowid(flow_id)
        if feature_binning_param.local_only:
            data_instances = feature_binning_obj.fit_local(data_instances)
        else:
            data_instances = feature_binning_obj.fit(data_instances)
        save_result = feature_binning_obj.save_model(
            self.workflow_param.model_table,
            self.workflow_param.model_namespace)
        # Save model result in pipeline
        for meta_buffer_type, param_buffer_type in save_result:
            self.pipeline.node_meta.append(meta_buffer_type)
            self.pipeline.node_param.append(param_buffer_type)

        LOGGER.info("Finish feature selection")
        return data_instances
Example #9
    def _load_model(self, model_dict):
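        # Restore a trained selection model from 'model' and, when an
        # 'isometric_model' is provided, reload the upstream binning model from it.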

        if 'model' in model_dict:
            # self._parse_need_run(model_dict, MODEL_META_NAME)
            LOGGER.debug("Feature selection need run: {}".format(self.need_run))
            if not self.need_run:
                return
            model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
            model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

            self.model_output = {
                MODEL_META_NAME: model_meta,
                MODEL_PARAM_NAME: model_param
            }
            LOGGER.debug("Model output set, model_output is :{}".format(self.model_output))
            self.results = list(model_param.results)
            left_col_obj = model_param.final_left_cols

            original_headers = list(left_col_obj.original_cols)
            self.header = original_headers
            left_col_name_dict = dict(left_col_obj.left_cols)
            LOGGER.debug("In load model, left_col_name_dict: {}, original_headers: {}".format(left_col_name_dict,
                                                                                              original_headers))
            left_cols = {}
            for col_name, is_left in left_col_name_dict.items():
                left_cols[col_name] = is_left
            LOGGER.debug("Self.left_cols: {}".format(left_cols))
            self.filter_result = SelfFilterResult(header=original_headers, to_select_cols_all=list(left_cols.keys()))
            self.filter_result.set_left_cols(left_cols)

        if 'isometric_model' in model_dict:

            LOGGER.debug("Has isometric_model, model_dict: {}".format(model_dict))
            if self.party_name == consts.GUEST:
                self.binning_model = HeteroFeatureBinningGuest()
            else:
                self.binning_model = HeteroFeatureBinningHost()

            new_model_dict = {'model': model_dict['isometric_model']}
            self.binning_model._load_model(new_model_dict)
class HeteroBinningGuestWorkflow(WorkFlow):
    def _initialize(self, config_path):
        self._initialize_role_and_mode()
        self._initialize_model(config_path)
        self._initialize_workflow_param(config_path)

    def _initialize_role_and_mode(self):
        self.role = consts.GUEST
        self.mode = consts.HETERO

    def _initialize_intersect(self, config):
        pass

    def _initialize_model(self, runtime_conf_path):
        binning_param = FeatureBinningParam()
        self.binning_param = ParamExtract.parse_param_from_config(binning_param, runtime_conf_path)
        FeatureBinningParamChecker.check_param(self.binning_param)
        self.model = HeteroFeatureBinningGuest(self.binning_param)
        LOGGER.debug("Guest model started")

    def save_binning_result(self):

        meta_table = self.model.save_model(self.workflow_param.model_table, self.workflow_param.model_namespace)
        return meta_table

    @status_tracer_decorator.status_trace
    def run(self):
        self._init_argument()

        if self.workflow_param.method == "binning":
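            # Fit: learn split points, save the model, then one-hot encode the
            # training data. Otherwise: load the saved model, transform, and
            # one-hot encode with the existing encoder.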

            if self.binning_param.process_method == 'fit':
                train_data_instance = self.gen_data_instance(self.workflow_param.train_input_table,
                                                             self.workflow_param.train_input_namespace,
                                                             mode='fit')
                LOGGER.debug("After dataio, header is : {}".format(train_data_instance.schema))
                if self.binning_param.local_only:
                    self.model.fit_local(train_data_instance)
                else:
                    LOGGER.debug("Start model fit")
                    self.model.fit(train_data_instance)
                self.model.save_model(self.workflow_param.model_table, self.workflow_param.model_namespace)
                train_data_instance = self.one_hot_encoder_fit_transform(train_data_instance)

            else:
                train_data_instance = self.gen_data_instance(self.workflow_param.train_input_table,
                                                             self.workflow_param.train_input_namespace,
                                                             mode='transform')
                LOGGER.debug("After dataio, header is : {}".format(train_data_instance.schema))
                self.model.load_model(self.workflow_param.model_table, self.workflow_param.model_namespace)
                if self.binning_param.local_only:
                    self.model.transform_local(train_data_instance)
                else:
                    self.model.transform(train_data_instance)
                self.save_binning_result()
                train_data_instance = self.one_hot_encoder_transform(train_data_instance)
            self._show_data(train_data_instance)
        else:
            raise TypeError("method %s is not supported yet" % (self.workflow_param.method))

        LOGGER.info("Task end")

    def _show_data(self, data_instances):
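        # Debug helper: log the feature vectors of at most 20 rows of the result.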
        local_data = data_instances.collect()
        LOGGER.debug("data header: {}".format(data_instances.schema))
        n = 0
        for k, v in local_data:
            LOGGER.debug("new data is :{}".format(v.features))
            n += 1
            if n >= 20:
                break
Example #11
    def test_feature_binning(self):
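        # Fit a guest binning model, export it, then feed the exported model back
        # into a fresh instance and run the transform stage on the same table.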
        binning_guest = HeteroFeatureBinningGuest()

        guest_param = self._make_param_dict('fit')

        binning_guest.run(guest_param, self.args)

        result_data = binning_guest.save_data()
        local_data = result_data.collect()
        print("data in fit")
        for k, v in local_data:
            print("k: {}, v: {}".format(k, v.features))
        guest_model = {self.model_name: binning_guest.export_model()}

        guest_args = {
            'data': {
                self.model_name: {
                    'data': self.table
                }
            },
            'model': guest_model
        }

        binning_guest = HeteroFeatureBinningGuest()

        guest_param = self._make_param_dict('transform')

        binning_guest.run(guest_param, guest_args)

        result_data = binning_guest.save_data()
        local_data = result_data.collect()
        print("data in transform")
        for k, v in local_data:
            print("k: {}, v: {}".format(k, v.features))
Example #12
class HeteroFeatureSelectionGuest(BaseHeteroFeatureSelection):
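    # Guest side of hetero feature selection: applies the configured filter
    # methods in order, using hetero feature binning to obtain the IV statistics
    # that the IV-based filters consume.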
    def __init__(self, params):
        super(HeteroFeatureSelectionGuest, self).__init__(params)
        self.left_cols = None
        self.host_left_cols = None
        self.local_only = params.local_only
        self.guest_iv_attrs = None
        self.host_iv_attrs = None
        self.bin_param = self.params.bin_param
        self.static_obj = None
        self.send_times = 0
        self.binning_model = None
        self.results = []
        self.flowid = ''

    def fit(self, data_instances):
        self._abnormal_detection(data_instances)
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]

        self._parse_cols(data_instances)
        self.left_cols = self.cols.copy()

        for method in self.filter_method:
            self.filter_one_method(data_instances, method)
            if len(self.left_cols) == 0:
                LOGGER.warning(
                    "After filter methods, no features are left. Please check your filter parameters"
                )
                break

    def fit_local(self, data_instances):
        self._abnormal_detection(data_instances)
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]

        feature_selection_obj = FeatureSelection(self.params)
        self.left_cols = feature_selection_obj.filter(data_instances)
        if self.cols == -1:
            self.cols = feature_selection_obj.select_cols

        self.results = feature_selection_obj.results

    def fit_local_transform(self, data_instances):
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]
        self.fit_local(data_instances)
        new_data = self.transform(data_instances)
        new_data.schema['header'] = self.header

        return new_data

    def transform(self, data_instances):
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]
        new_data = self._transfer_data(data_instances)
        new_data.schema['header'] = self.header

        return new_data

    def fit_transform(self, data_instances):
        self._abnormal_detection(data_instances)

        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]
        self.fit(data_instances)
        new_data = self.transform(data_instances)
        new_data.schema['header'] = self.header
        return new_data

    def filter_one_method(self, data_instances, method):
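        # Dispatch on the requested filter method; every branch trims
        # self.left_cols and records the filter's result for the model output.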

        if method == consts.IV_VALUE_THRES:
            self._calculates_iv_attrs(data_instances,
                                      flowid_postfix='iv_value')
            iv_param = self.params.iv_param
            iv_filter = feature_selection.IVValueSelectFilter(
                iv_param, self.left_cols, self.guest_iv_attrs)
            new_left_cols = iv_filter.filter()

            self.results.append(iv_filter.to_result())

            # Renew current left cols and iv_attrs
            new_iv_list = self._renew_iv_attrs(new_left_cols, self.left_cols,
                                               self.guest_iv_attrs)
            self.guest_iv_attrs = new_iv_list
            self.left_cols = new_left_cols

            if not self.local_only:
                self._filter_host_iv_value()
            LOGGER.info(
                "Finish iv value threshold filter. Current left cols are: {}".
                format(self.left_cols))

        if method == consts.IV_PERCENTILE:

            self._calculates_iv_attrs(data_instances,
                                      flowid_postfix='iv_percentile')
            iv_param = self.params.iv_param
            iv_filter = feature_selection.IVPercentileFilter(iv_param)
            iv_filter.add_attrs(self.guest_iv_attrs, self.left_cols)
            if not self.local_only:
                iv_filter.add_attrs(self.host_iv_attrs, self.host_left_cols)
            left_cols = iv_filter.filter_multiple_parties()
            new_left_cols = left_cols[0]
            self.results.append(iv_filter.to_result())

            # Renew current left cols and iv_attrs
            new_iv_list = self._renew_iv_attrs(new_left_cols, self.left_cols,
                                               self.guest_iv_attrs)
            self.guest_iv_attrs = new_iv_list
            self.left_cols = new_left_cols

            # If host has participated, send result to host
            if len(left_cols) > 1:
                new_host_left_cols = left_cols[1]
                new_host_iv_list = self._renew_iv_attrs(
                    new_host_left_cols, self.host_left_cols,
                    self.host_iv_attrs)
                self.host_iv_attrs = new_host_iv_list
                self.host_left_cols = new_host_left_cols
                self._send_host_result_cols()
            LOGGER.info(
                "Finish iv percentile filter. Current left cols are: {}".
                format(self.left_cols))

        if method == consts.COEFFICIENT_OF_VARIATION_VALUE_THRES:
            coe_param = self.params.coe_param
            coe_filter = feature_selection.CoeffOfVarValueFilter(
                coe_param, self.left_cols, self.static_obj)
            self.left_cols = coe_filter.filter(data_instances)
            self.static_obj = coe_filter.statics_obj
            self.results.append(coe_filter.to_result())

            LOGGER.info(
                "Finish coefficient_of_variation value threshold filter. Current left cols are: {}"
                .format(self.left_cols))

        if method == consts.UNIQUE_VALUE:
            unique_param = self.params.unique_param
            unique_filter = feature_selection.UniqueValueFilter(
                unique_param, self.left_cols, self.static_obj)
            self.left_cols = unique_filter.filter(data_instances)
            self.static_obj = unique_filter.statics_obj
            self.results.append(unique_filter.to_result())

            LOGGER.info(
                "Finish unique value filter. Current left cols are: {}".format(
                    self.left_cols))

        if method == consts.OUTLIER_COLS:
            outlier_param = self.params.outlier_param
            outlier_filter = feature_selection.OutlierFilter(
                outlier_param, self.left_cols)
            self.left_cols = outlier_filter.filter(data_instances)
            self.results.append(outlier_filter.to_result())
            LOGGER.info(
                "Finish outlier cols filter. Current left cols are: {}".format(
                    self.left_cols))

    def _transfer_data(self, data_instances):
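        # Keep only the selected columns in every data instance and reset the
        # header to match; left_cols == -1 means all columns are kept.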
        if self.left_cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            self.left_cols = [i for i in range(features_shape)]

        f = functools.partial(self.select_cols, left_cols=self.left_cols)

        new_data = data_instances.mapValues(f)
        self._reset_header()
        return new_data

    def _calculates_iv_attrs(self, data_instances, flowid_postfix=''):
        if self.local_only and self.guest_iv_attrs is not None:
            return

        bin_flow_id = self.flowid + flowid_postfix
        self.bin_param.cols = self.left_cols
        if self.binning_model is None:
            self.binning_model = HeteroFeatureBinningGuest(self.bin_param)
            self.binning_model.set_flowid(bin_flow_id)
        else:
            self.binning_model.reset(self.bin_param, flowid=bin_flow_id)

        if self.local_only:
            if self.guest_iv_attrs is None:
                self.guest_iv_attrs = self.binning_model.fit_local(
                    data_instances=data_instances)
        else:
            iv_attrs = self.binning_model.fit(data_instances)
            self.guest_iv_attrs = iv_attrs.get('local')
            self.host_iv_attrs = iv_attrs.get('remote')
            self.host_left_cols = [i for i in range(len(self.host_iv_attrs))]
            LOGGER.debug("Host left cols: {}".format(self.host_left_cols))
        LOGGER.info("Finish federated binning with host.")

    def _send_host_result_cols(self):
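        # Send the indices of the host columns that survived filtering back to
        # the host, tagging the transfer with an incrementing send counter.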
        result_cols_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.result_left_cols, self.send_times)
        federation.remote(self.host_left_cols,
                          name=self.transfer_variable.result_left_cols.name,
                          tag=result_cols_id,
                          role=consts.HOST,
                          idx=0)
        self.send_times += 1
        LOGGER.info(
            "Sent result cols from guest to host, result cols are: {}".format(
                self.host_left_cols))

    def _filter_host_iv_value(self):
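        # Receive the host's IV threshold, filter the host-side IV list with it
        # locally, then send the surviving host columns back to the host.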
        host_iv_thres_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.host_iv_threshold)
        host_iv_thres = federation.get(
            name=self.transfer_variable.host_iv_threshold.name,
            tag=host_iv_thres_id,
            idx=0)
        LOGGER.info("Received iv threshold from host, threshold is :{}".format(
            host_iv_thres))
        iv_param = IVSelectionParam(value_threshold=host_iv_thres)
        host_filter = feature_selection.IVValueSelectFilter(
            iv_param, self.host_left_cols, self.host_iv_attrs)
        new_host_left_cols = host_filter.filter()

        # Renew current host left cols and host iv_attrs
        self.host_iv_attrs = self._renew_iv_attrs(new_host_left_cols,
                                                  self.host_left_cols,
                                                  self.host_iv_attrs)
        self.host_left_cols = new_host_left_cols

        self._send_host_result_cols()

    def _renew_iv_attrs(self, new_left_cols, pre_left_cols, iv_attrs):
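        # Re-index the IV list so it only contains entries for the columns that
        # survived the latest filter.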
        new_iv_list = []
        for left_col in new_left_cols:
            idx = pre_left_cols.index(left_col)
            new_iv_list.append(iv_attrs[idx])
        return new_iv_list

    def _parse_cols(self, data_instances):
        if self.cols == -1:
            features_shape = get_features_shape(data_instances)
            if features_shape is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            self.cols = [i for i in range(features_shape)]
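
For reference, the column bookkeeping behind the IV-based filters above reduces to keeping the columns whose information value clears a threshold. A minimal, self-contained sketch of that idea (plain Python with a hypothetical filter_by_iv helper and made-up IV values; not the FATE API):

# Minimal sketch of IV-threshold column selection, assuming a plain dict of
# per-column IV values (hypothetical example data, independent of FATE).
def filter_by_iv(iv_by_col, threshold):
    # Keep the columns whose information value meets the threshold,
    # preserving the input order.
    return [col for col, iv in iv_by_col.items() if iv >= threshold]

iv_by_col = {'x1': 0.02, 'x2': 0.31, 'x3': 0.15}
print(filter_by_iv(iv_by_col, threshold=0.1))  # ['x2', 'x3']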