def fit(self, data):
    """Run the configured intersection on ``data`` and publish its metrics.

    The outcome is stored on ``self.intersect_ids``. When that outcome is
    truthy, ``self.intersect_num`` and ``self.intersect_rate`` are refreshed.
    The model summary and the two metrics are always reported afterwards.
    Nothing is returned.

    Args:
        data: Input table; it must provide ``count()``.
            NOTE(review): presumably a distributed id table -- confirm
            against the caller.

    Raises:
        ValueError: If repeated-id processing is enabled together with the
            cache while the method is RSA, or with more than one host while
            ``repeated_id_owner`` is not the guest.
    """
    self.__init_intersect_method()
    param = self.model_param

    if param.repeated_id_process:
        if param.intersect_cache_param.use_cache is True and param.intersect_method == consts.RSA:
            raise ValueError("Not support cache module while repeated id process.")

        if len(self.host_party_id_list) > 1 and param.repeated_id_owner != consts.GUEST:
            raise ValueError("While multi-host, repeated_id_owner should be guest.")

        proc_obj = RepeatedIDIntersect(repeated_id_owner=param.repeated_id_owner, role=self.role)
        data = proc_obj.run(data=data)

    if param.allow_info_share:
        # Parentheses only spell out the grouping Python already applies
        # ("and" binds tighter than "or").
        rsa_guest_owns_info = (
            param.intersect_method == consts.RSA and param.info_owner == consts.GUEST
        )
        raw_join_role_owns_info = (
            param.intersect_method == consts.RAW and param.join_role == param.info_owner
        )
        if rsa_guest_owns_info or raw_join_role_owns_info:
            # The flag is cleared before the intersection object runs.
            param.sync_intersect_ids = False

    self.intersect_ids = self.intersection_obj.run(data)

    if param.allow_info_share:
        self.intersect_ids = self.__share_info(data)

    LOGGER.info("Finish intersection")

    if self.intersect_ids:
        self.intersect_num = self.intersect_ids.count()
        total_num = data.count()
        # An empty input table used to raise ZeroDivisionError here; report a
        # rate of 0.0 instead so the summary and metrics below still go out.
        self.intersect_rate = self.intersect_num * 1.0 / total_num if total_num else 0.0

    self.set_summary(self.get_model_summary())
    self.callback_metric(
        metric_name=self.metric_name,
        metric_namespace=self.metric_namespace,
        metric_data=[
            Metric("intersect_count", self.intersect_num),
            Metric("intersect_rate", self.intersect_rate),
        ],
    )
    self.tracker.set_metric_meta(
        metric_namespace=self.metric_namespace,
        metric_name=self.metric_name,
        metric_meta=MetricMeta(name=self.metric_name, metric_type=self.metric_type),
    )
def fit(self, data):
    """Run the configured intersection on ``data`` and publish its metrics.

    The outcome is stored on ``self.intersect_ids``. When repeated-id
    processing is enabled, ``data`` is first passed through
    ``RepeatedIDIntersect.recover`` and the intersection outcome through
    ``RepeatedIDIntersect.expand``. When the outcome is truthy,
    ``self.intersect_num`` and ``self.intersect_rate`` are refreshed. The
    model summary and the two metrics are always reported afterwards.
    Nothing is returned.

    Args:
        data: Input table; it must provide ``count()``.
            NOTE(review): presumably a distributed id table -- confirm
            against the caller.

    Raises:
        ValueError: If repeated-id processing is enabled together with the
            cache while the method is RSA, or with more than one host while
            ``repeated_id_owner`` is not the guest.
    """
    self.init_intersect_method()
    param = self.model_param

    if param.repeated_id_process:
        if param.intersect_cache_param.use_cache is True and param.intersect_method == consts.RSA:
            raise ValueError("Not support cache module while repeated id process.")

        if len(self.host_party_id_list) > 1 and param.repeated_id_owner != consts.GUEST:
            raise ValueError("While multi-host, repeated_id_owner should be guest.")

        proc_obj = RepeatedIDIntersect(repeated_id_owner=param.repeated_id_owner, role=self.role)
        if param.with_sample_id:
            proc_obj.use_sample_id()
        data = proc_obj.recover(data=data)

    self.intersect_ids = self.intersection_obj.run_intersect(data)

    if param.repeated_id_process:
        if not param.sync_intersect_ids:
            # Without synchronised ids the recovered input itself is expanded.
            self.intersect_ids = data
        self.intersect_ids = proc_obj.expand(self.intersect_ids)

        if param.repeated_id_owner == self.role and param.only_output_key:
            # mapValues yields a new table, so carry the sid_name schema
            # entry over by hand after blanking the values.
            sid_name = self.intersect_ids.schema.get('sid_name')
            self.intersect_ids = self.intersect_ids.mapValues(lambda v: None)
            self.intersect_ids.schema['sid_name'] = sid_name

    if param.allow_info_share:
        # Parentheses only spell out the grouping Python already applies
        # ("and" binds tighter than "or").
        rsa_guest_owns_info = (
            param.intersect_method == consts.RSA and param.info_owner == consts.GUEST
        )
        raw_join_role_owns_info = (
            param.intersect_method == consts.RAW and param.join_role == param.info_owner
        )
        if rsa_guest_owns_info or raw_join_role_owns_info:
            # NOTE(review): the flag is cleared only here, after
            # run_intersect() and the repeated-id branch above have already
            # read it -- confirm this ordering is intended.
            param.sync_intersect_ids = False

        self.intersect_ids = self.__share_info(self.intersect_ids)

    LOGGER.info("Finish intersection")

    if self.intersect_ids:
        self.intersect_num = self.intersect_ids.count()
        total_num = data.count()
        # An empty input table used to raise ZeroDivisionError here; report a
        # rate of 0.0 instead so the summary and metrics below still go out.
        self.intersect_rate = self.intersect_num * 1.0 / total_num if total_num else 0.0

    self.set_summary(self.get_model_summary())
    self.callback_metric(
        metric_name=self.metric_name,
        metric_namespace=self.metric_namespace,
        metric_data=[
            Metric("intersect_count", self.intersect_num),
            Metric("intersect_rate", self.intersect_rate),
        ],
    )
    self.tracker.set_metric_meta(
        metric_namespace=self.metric_namespace,
        metric_name=self.metric_name,
        metric_meta=MetricMeta(name=self.metric_name, metric_type=self.metric_type),
    )