示例#1
0
    def fit(self, data):
        """Intersect ``data`` and publish the intersection count and rate.

        Steps, in order:

        1. Initialise the intersection method.
        2. When ``repeated_id_process`` is enabled, validate the configuration
           and replace ``data`` with the output of ``RepeatedIDIntersect.run``.
        3. When ``allow_info_share`` is enabled, switch ``sync_intersect_ids``
           off for the RSA/guest-info-owner and RAW/join-role-is-info-owner
           combinations.
        4. Run the intersection; when info sharing is enabled the result is
           then replaced by ``__share_info(data)``.
        5. Store ``intersect_num`` / ``intersect_rate``, set the model summary
           and emit the metrics.

        Args:
            data: Input table. It must provide ``count()`` and be accepted by
                ``self.intersection_obj.run``.
                NOTE(review): presumably a distributed table keyed by sample
                id -- confirm against the caller.

        Raises:
            ValueError: If repeated-id processing is requested together with
                the RSA cache, or with several hosts while
                ``repeated_id_owner`` is not the guest.
        """
        self.__init_intersect_method()

        # Bound after initialisation so the alias refers to the final object.
        param = self.model_param

        if param.repeated_id_process:
            if param.intersect_cache_param.use_cache is True and param.intersect_method == consts.RSA:
                raise ValueError(
                    "Not support cache module while repeated id process.")

            if len(self.host_party_id_list) > 1 and param.repeated_id_owner != consts.GUEST:
                raise ValueError(
                    "While multi-host, repeated_id_owner should be guest.")

            proc_obj = RepeatedIDIntersect(
                repeated_id_owner=param.repeated_id_owner,
                role=self.role)
            data = proc_obj.run(data=data)

        if param.allow_info_share:
            # Parentheses only make the existing and/or precedence explicit.
            if (param.intersect_method == consts.RSA and param.info_owner == consts.GUEST) \
                    or (param.intersect_method == consts.RAW and param.join_role == param.info_owner):
                param.sync_intersect_ids = False

        self.intersect_ids = self.intersection_obj.run(data)

        if param.allow_info_share:
            self.intersect_ids = self.__share_info(data)

        LOGGER.info("Finish intersection")

        if self.intersect_ids:
            self.intersect_num = self.intersect_ids.count()
            data_count = data.count()
            if data_count > 0:
                self.intersect_rate = self.intersect_num * 1.0 / data_count
            else:
                # An empty input used to raise ZeroDivisionError here; nothing
                # can intersect with an empty table, so report a rate of 0.
                LOGGER.warning("Input data is empty, intersect_rate is set to 0")
                self.intersect_rate = 0.0

        self.set_summary(self.get_model_summary())

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[
                                 Metric("intersect_count", self.intersect_num),
                                 Metric("intersect_rate", self.intersect_rate)
                             ])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))
示例#2
0
    def fit(self, data):
        """Intersect ``data`` and publish the intersection count and rate.

        Steps, in order:

        1. Initialise the intersection method.
        2. When ``repeated_id_process`` is enabled, validate the configuration
           and replace ``data`` with the output of
           ``RepeatedIDIntersect.recover`` (after ``use_sample_id()`` if
           ``with_sample_id`` is set).
        3. Run the intersection via ``intersection_obj.run_intersect``.
        4. When ``repeated_id_process`` is enabled, expand the result with the
           same ``RepeatedIDIntersect`` object; if ids are not synchronised
           the (recovered) ``data`` is expanded instead. For the repeated-id
           owner with ``only_output_key`` the values are replaced by ``None``
           while the ``sid_name`` schema entry is carried over.
        5. When ``allow_info_share`` is enabled, switch ``sync_intersect_ids``
           off for the RSA/guest-info-owner and RAW/join-role-is-info-owner
           combinations and pass the result through ``__share_info``.
        6. Store ``intersect_num`` / ``intersect_rate``, set the model summary
           and emit the metrics.

        Args:
            data: Input table. It must provide ``count()`` and be accepted by
                ``self.intersection_obj.run_intersect``.
                NOTE(review): presumably a distributed table keyed by sample
                id -- confirm against the caller.

        Raises:
            ValueError: If repeated-id processing is requested together with
                the RSA cache, or with several hosts while
                ``repeated_id_owner`` is not the guest.
        """
        self.init_intersect_method()

        # Bound after initialisation so the alias refers to the final object.
        param = self.model_param

        if param.repeated_id_process:
            if param.intersect_cache_param.use_cache is True and param.intersect_method == consts.RSA:
                raise ValueError(
                    "Not support cache module while repeated id process.")

            if len(self.host_party_id_list) > 1 and param.repeated_id_owner != consts.GUEST:
                raise ValueError(
                    "While multi-host, repeated_id_owner should be guest.")

            proc_obj = RepeatedIDIntersect(
                repeated_id_owner=param.repeated_id_owner,
                role=self.role)
            if param.with_sample_id:
                proc_obj.use_sample_id()
            data = proc_obj.recover(data=data)

        self.intersect_ids = self.intersection_obj.run_intersect(data)

        if param.repeated_id_process:
            # proc_obj was created in the first repeated-id branch above.
            if not param.sync_intersect_ids:
                self.intersect_ids = data

            self.intersect_ids = proc_obj.expand(self.intersect_ids)
            if param.repeated_id_owner == self.role and param.only_output_key:
                # mapValues yields a new table, so re-attach the sid_name
                # entry that was read from the old schema.
                sid_name = self.intersect_ids.schema.get('sid_name')
                self.intersect_ids = self.intersect_ids.mapValues(
                    lambda v: None)
                self.intersect_ids.schema['sid_name'] = sid_name

        if param.allow_info_share:
            # Parentheses only make the existing and/or precedence explicit.
            if (param.intersect_method == consts.RSA and param.info_owner == consts.GUEST) \
                    or (param.intersect_method == consts.RAW and param.join_role == param.info_owner):
                param.sync_intersect_ids = False

            self.intersect_ids = self.__share_info(self.intersect_ids)

        LOGGER.info("Finish intersection")

        if self.intersect_ids:
            self.intersect_num = self.intersect_ids.count()
            data_count = data.count()
            if data_count > 0:
                self.intersect_rate = self.intersect_num * 1.0 / data_count
            else:
                # An empty input used to raise ZeroDivisionError here; nothing
                # can intersect with an empty table, so report a rate of 0.
                LOGGER.warning("Input data is empty, intersect_rate is set to 0")
                self.intersect_rate = 0.0

        self.set_summary(self.get_model_summary())

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[
                                 Metric("intersect_count", self.intersect_num),
                                 Metric("intersect_rate", self.intersect_rate)
                             ])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))