Пример #1
0
def client_align_labels(self, data_inst):
    """Align local labels with the federation and resize the output layer.

    Collects the set of labels present in ``data_inst``, passes it to
    ``HomoLabelEncoderClient().label_alignment`` and stores the returned
    mapping in ``self._label_align_mapping``.  The last ``Linear``
    (``pytorch``) or ``Dense`` (``nn`` / ``keras``) layer found in
    ``self.nn_define`` then gets its output size set to the number of
    entries in that mapping.  The layer is left untouched when it already
    has a single output and there are exactly two classes.

    Args:
        data_inst: table whose values expose a ``label`` attribute.
    """
    local_labels = data_inst.map(
        lambda key, inst: [key, {inst.label}]
    ).reduce(lambda left, right: left | right)
    _aligned, mapping = HomoLabelEncoderClient().label_alignment(local_labels)
    self._label_align_mapping = mapping
    num_classes = len(self._label_align_mapping)

    # Where the layer list lives and how a layer names its type depends on
    # the configuration flavour.
    kind = self.config_type
    if kind == "pytorch":
        layers, type_key, wanted = self.nn_define, "layer", "Linear"
    elif kind == "nn":
        layers, type_key, wanted = self.nn_define, "layer", "Dense"
    elif kind == "keras":
        layers = self.nn_define["config"]["layers"]
        type_key, wanted = "class_name", "Dense"
    else:
        return

    # Only the last matching layer (closest to the output) is considered.
    last = next(
        (layer for layer in reversed(layers) if layer[type_key] == wanted),
        None,
    )
    if last is None:
        return

    # Locate the container and slot that hold the output dimension.
    if kind == "pytorch":
        holder, slot = last["config"], 1
        output_dim = holder[slot]
    else:
        holder = last if kind == "nn" else last["config"]
        slot = "units"
        output_dim = holder.get(slot, None)

    # A single output unit is kept as-is for two-class problems.
    if output_dim == 1 and num_classes == 2:
        return
    holder[slot] = num_classes
Пример #2
0
    def _client_check_data(self, data_instances):
        """Validate client data and make sure the task is binary classification.

        Runs the abnormal-value checks and schema initialisation on
        ``data_instances``, then aligns the local label classes through
        ``HomoLabelEncoderClient`` and verifies that exactly two aligned
        labels exist.

        Args:
            data_instances: the local training table.

        Raises:
            ValueError: if the aligned label set has more than two classes,
                or fewer than two.
        """
        self._abnormal_detection(data_instances)
        self.check_abnormal_values(data_instances)
        self.init_schema(data_instances)

        # Only the class list and the aligned labels are needed here; the
        # class count and the label mapping are discarded.
        _, classes_ = ClassifyLabelChecker.validate_label(data_instances)
        aligned_label, _ = HomoLabelEncoderClient().label_alignment(classes_)
        if len(aligned_label) > 2:
            raise ValueError("Homo LR support binary classification only now")
        elif len(aligned_label) <= 1:
            raise ValueError("Number of classes should be equal to 2")
Пример #3
0
    def __init__(
        self,
        data_instances: CTableABC,
        expected_label_type=np.float32,
        label_align_mapping=None,
        **kwargs,
    ):
        """Materialise a table of instances into dense feature/label arrays.

        Args:
            data_instances: table of ``(key, instance)`` pairs; each instance
                exposes ``features`` and ``label``.
            expected_label_type: dtype used for the label array ``self.y``.
            label_align_mapping: optional mapping from raw label to aligned
                label.  When ``None`` the local label set is collected and
                aligned through ``HomoLabelEncoderClient``.
            **kwargs: accepted but not used by this constructor.

        Raises:
            ValueError: if ``data_instances`` contains no instances.
        """
        # partition
        self.partitions = data_instances.partitions

        # size
        total = data_instances.count()
        self.size = total
        if total <= 0:
            raise ValueError("num of instances is 0")

        # alignment labels: gather one label set per partition, then merge
        if label_align_mapping is None:
            per_partition = data_instances.applyPartitions(
                lambda rows: {row[1].label for row in rows})
            labels = per_partition.reduce(
                lambda left, right: left.union(right))
            _, label_align_mapping = HomoLabelEncoderClient().label_alignment(
                labels)
            LOGGER.debug(f"label mapping: {label_align_mapping}")
        self._label_align_mapping = label_align_mapping

        # shape: taken from the first instance's feature array
        first_instance = data_instances.first()[1]
        self.x_shape = first_instance.features.shape
        self.x = np.zeros((self.size, *self.x_shape), dtype=np.float32)
        self.y = np.zeros((self.size, ), dtype=expected_label_type)
        self._keys = []

        # fill the pre-allocated arrays row by row, remembering each key
        for row, (key, instance) in enumerate(data_instances.collect()):
            self._keys.append(key)
            self.x[row] = instance.features
            self.y[row] = label_align_mapping[instance.label]

        self._num_labels = len(label_align_mapping)
        self._num_features = self.x_shape[0]
Пример #4
0
    def __init__(self,
                 root,
                 is_train=True,
                 expected_label_type=np.float32,
                 **kwargs):
        """Build a vision dataset from a directory described by ``config.yaml``.

        The directory ``root`` must contain ``config.yaml``, a ``filenames``
        file (one name per line) and an ``images`` folder.  Targets are either
        images under ``targets/`` or integers listed in a ``targets`` file
        with ``filename,target`` lines.

        Args:
            root: dataset directory.
            is_train: when true, a placeholder label alignment call is made
                before loading.
            expected_label_type: callable used to convert integer targets
                read from the ``targets`` file.
            **kwargs: accepted but not used by this constructor.

        Raises:
            TypeError: if the dataset type is not ``vision``, the inputs type
                is not ``images``, or the targets type is neither ``images``
                nor ``integer``.
        """

        # fake alignment
        if is_train:
            HomoLabelEncoderClient().label_alignment(["fake"])

        # load data
        with open(os.path.join(root, "config.yaml")) as f:
            config = yaml.safe_load(f)
        if config["type"] != "vision":
            raise TypeError(f"{config['type']}")

        # read filenames
        with open(os.path.join(root, "filenames")) as f:
            file_names = [filename.strip() for filename in f]

        # read inputs
        if config["inputs"]["type"] != "images":
            raise TypeError("inputs type of vision type should be images")
        input_ext = config["inputs"]["ext"]
        self.images = [
            os.path.join(root, "images", f"{x}.{input_ext}")
            for x in file_names
        ]
        self._PIL_mode = config["inputs"].get("PIL_mode", "L")

        # read targets
        targets_type = config["targets"]["type"]
        if targets_type == "images":
            target_ext = config["targets"]["ext"]
            self.targets = [
                os.path.join(root, "targets", f"{x}.{target_ext}")
                for x in file_names
            ]
            self.targets_is_image = True
        elif targets_type == "integer":
            targets_mapping = {}
            with open(os.path.join(root, "targets")) as f:
                for line in f:
                    # A blank (e.g. trailing) line carries no target; skip it
                    # instead of failing on tuple unpacking.
                    if not line.strip():
                        continue
                    filename, target = line.split(",")
                    targets_mapping[filename.strip()] = expected_label_type(
                        target.strip())
            self.targets = [
                targets_mapping[filename] for filename in file_names
            ]
            self.targets_is_image = False
        else:
            # Fail here with a clear message rather than later with an
            # AttributeError on the never-assigned ``self.targets``.
            raise TypeError(
                "targets type of vision type should be images or integer, "
                f"got {targets_type}")
        self._keys = file_names

        assert len(self.images) == len(self.targets)

        transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
        ])
        # Image targets get the same tensor conversion as the inputs;
        # integer targets are passed through unchanged.
        target_transform = transform if self.targets_is_image else None

        super(VisionDataSet, self).__init__(
            root,
            transform=transform,
            target_transform=target_transform,
        )
Пример #5
0
    def fit(self, data_inst, validate_data=None):
        """Train the boosting model on the local data.

        Steps, in order: align and bin the data, build the feature-name
        mapping, sync the feature count, set up the validation strategy,
        align labels (classification only), initialise loss and scores, then
        run ``self.boosting_round`` epochs.  Each epoch fits one booster per
        output dimension, sends the local loss to the aggregator and may stop
        early on the aggregator's converge status.

        Args:
            data_inst: training table; its ``schema`` is used for the
                feature-name mapping.
            validate_data: optional validation table handed to the
                validation strategy.

        NOTE(review): in the non-classification branch this method does not
        assign ``self.num_classes`` or ``self.booster_dim`` although both are
        read later; presumably they are initialised elsewhere -- confirm.
        """

        # binning
        data_inst = self.data_alignment(data_inst)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = self.federated_binning(
            data_inst)

        # fid mapping
        self.feature_name_fid_mapping = self.gen_feature_fid_mapping(
            data_inst.schema)

        # set feature_num: one row of split points per feature
        self.feature_num = self.bin_split_points.shape[0]

        # sync feature num
        self.sync_feature_num()

        # initialize validation strategy
        self.validation_strategy = self.init_validation_strategy(
            train_data=data_inst,
            validate_data=validate_data,
        )

        # check labels
        local_classes = self.check_label(self.data_bin)

        # sync label class and set y
        if self.task_type == consts.CLASSIFICATION:

            aligned_label, new_label_mapping = HomoLabelEncoderClient(
            ).label_alignment(local_classes)
            self.classes_ = aligned_label
            self.check_label_starts_from_zero(self.classes_)
            # set labels: y holds the aligned label of every instance
            self.num_classes = len(new_label_mapping)
            LOGGER.info('aligned labels are {}, num_classes is {}'.format(
                aligned_label, self.num_classes))
            self.y = self.data_bin.mapValues(
                lambda instance: new_label_mapping[instance.label])
            # set tree dimension: one booster per class when there are more
            # than two classes, otherwise a single booster
            self.booster_dim = self.num_classes if self.num_classes > 2 else 1
        else:
            # non-classification task: the raw label is the target
            self.y = self.data_bin.mapValues(lambda instance: instance.label)

        # set loss function
        self.loss = self.get_loss_function()

        # set y_hat_val
        self.y_hat, self.init_score = self.get_init_score(
            self.y, self.num_classes)

        LOGGER.info('begin to fit a boosting tree')
        for epoch_idx in range(self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            for class_idx in range(self.booster_dim):

                # fit a booster
                model = self.fit_a_booster(epoch_idx, class_idx)
                booster_meta, booster_param = model.get_model()
                # keep the booster only when both meta and param are present
                if booster_meta is not None and booster_param is not None:
                    self.booster_meta = booster_meta
                    self.boosting_model_list.append(booster_param)

                # update predict score
                cur_sample_weights = model.get_sample_weights()
                self.y_hat = self.get_new_predict_score(self.y_hat,
                                                        cur_sample_weights,
                                                        dim=class_idx)

            # report this epoch's local loss together with the sample count
            local_loss = self.compute_loss(self.y_hat, self.y)
            self.aggregator.send_local_loss(local_loss,
                                            self.data_bin.count(),
                                            suffix=(epoch_idx, ))

            if self.validation_strategy:
                self.validation_strategy.validate(self, epoch_idx)

            # check stop flag if n_iter_no_change is True
            # NOTE(review): the loss above is sent with suffix (epoch_idx,)
            # while the status below is fetched with suffix (str(epoch_idx),)
            # -- confirm the peer uses matching suffix types.
            if self.n_iter_no_change:
                should_stop = self.aggregator.get_converge_status(
                    suffix=(str(epoch_idx), ))
                if should_stop:
                    LOGGER.info('n_iter_no_change stop triggered')
                    break

        self.set_summary(self.generate_summary())
Пример #6
0
    def fit(self, data_inst, validate_data=None):
        """Train the boosting model on the local data, with warm-start support.

        Steps, in order: create the federation helpers, preprocess the data,
        build (or, on warm start, check) the feature-name mapping, sync the
        feature count, notify callbacks, align labels (classification only),
        initialise loss and scores, sync the round range, then run epochs
        from ``self.start_round`` up to ``self.boosting_round``.  Each epoch
        fits one learner per output dimension, sends the local loss to the
        aggregator and may stop early on the aggregator's converge status.

        Args:
            data_inst: training table; its ``schema`` is used for the
                feature-name mapping.
            validate_data: optional validation table passed to the callbacks.

        Raises:
            AssertionError: on warm start, if the aligned labels differ from
                the labels already stored in ``self.classes_``.
        """

        # init federation obj
        self.aggregator = HomoBoostClientAggregator()
        self.binning_obj = HomoFeatureBinningClient()

        # binning
        self.data_preporcess(data_inst)

        # fid mapping and warm start check
        if not self.is_warm_start:
            self.feature_name_fid_mapping = self.gen_feature_fid_mapping(
                data_inst.schema)
        else:
            self.feat_name_check(data_inst, self.feature_name_fid_mapping)

        # set feature_num: one row of split points per feature
        self.feature_num = self.bin_split_points.shape[0]

        # sync feature num
        self.sync_feature_num()

        # notify callbacks that training begins
        self.callback_list.on_train_begin(data_inst, validate_data)

        # check labels
        local_classes = self.check_label(self.data_bin)

        # set start round
        # NOTE(review): booster_dim here is the value held before the label
        # alignment below reassigns it -- presumably restored from a loaded
        # model on warm start; confirm.
        self.start_round = len(self.boosting_model_list) // self.booster_dim

        # sync label class and set y
        if self.task_type == consts.CLASSIFICATION:

            aligned_label, new_label_mapping = HomoLabelEncoderClient(
            ).label_alignment(local_classes)
            if self.is_warm_start:
                # Raised explicitly (same exception type and message as the
                # former assert) so the check still runs under ``python -O``.
                if set(aligned_label) != set(self.classes_):
                    raise AssertionError(
                        'warm start label alignment failed, differences: {}'.
                        format(set(aligned_label).symmetric_difference(
                            set(self.classes_))))
            self.classes_ = aligned_label
            self.check_label_starts_from_zero(self.classes_)
            # set labels: y holds the aligned label of every instance
            self.num_classes = len(new_label_mapping)
            LOGGER.info('aligned labels are {}, num_classes is {}'.format(
                aligned_label, self.num_classes))
            self.y = self.data_bin.mapValues(
                lambda instance: new_label_mapping[instance.label])
            # set tree dimension: one booster per class when there are more
            # than two classes, otherwise a single booster
            self.booster_dim = self.num_classes if self.num_classes > 2 else 1

        else:
            # non-classification task: the raw label is the target
            self.y = self.data_bin.mapValues(lambda instance: instance.label)

        # set loss function
        self.loss = self.get_loss_function()

        # set y_hat_val, if warm start predict cur samples
        if self.is_warm_start:
            self.y_hat = self.predict(data_inst, ret_format='raw')
            # boosting_round becomes the end round: the already-trained
            # rounds plus the rounds configured for this run
            self.boosting_round += self.start_round
            self.callback_warm_start_init_iter(self.start_round)
        else:
            self.y_hat, self.init_score = self.get_init_score(
                self.y, self.num_classes)

        # sync start round and end round
        self.sync_start_round_and_end_round()

        self.preprocess()

        LOGGER.info('begin to fit a boosting tree')
        for epoch_idx in range(self.start_round, self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            self.callback_list.on_epoch_begin(epoch_idx)

            for class_idx in range(self.booster_dim):

                # fit a booster
                model = self.fit_a_learner(epoch_idx, class_idx)
                booster_meta, booster_param = model.get_model()
                # keep the booster only when both meta and param are present
                if booster_meta is not None and booster_param is not None:
                    self.booster_meta = booster_meta
                    self.boosting_model_list.append(booster_param)

                # update predict score
                cur_sample_weights = model.get_sample_weights()
                self.y_hat = self.get_new_predict_score(self.y_hat,
                                                        cur_sample_weights,
                                                        dim=class_idx)

            # report this epoch's local loss together with the sample count
            local_loss = self.compute_loss(self.y_hat, self.y)
            self.aggregator.send_local_loss(local_loss,
                                            self.data_bin.count(),
                                            suffix=(epoch_idx, ))

            # hand the current training scores to the validation strategy
            # before the epoch-end callbacks run
            validation_strategy = self.callback_list.get_validation_strategy()
            if validation_strategy:
                validation_strategy.set_precomputed_train_scores(
                    self.score_to_predict_result(data_inst, self.y_hat))
            self.callback_list.on_epoch_end(epoch_idx)

            # check stop flag if n_iter_no_change is True
            # NOTE(review): the loss above is sent with suffix (epoch_idx,)
            # while the status below is fetched with suffix (str(epoch_idx),)
            # -- confirm the peer uses matching suffix types.
            if self.n_iter_no_change:
                should_stop = self.aggregator.get_converge_status(
                    suffix=(str(epoch_idx), ))
                if should_stop:
                    LOGGER.info('n_iter_no_change stop triggered')
                    break

        self.postprocess()
        self.callback_list.on_train_end()
        self.set_summary(self.generate_summary())