Пример #1
0
    def __synchronize_encryption(self, mode='train'):
        """
        Create a Paillier key pair and ship both keys to the other parties.

        A key of ``self.key_length`` bits is generated locally. The public
        key and then the private key are sent to the guest (idx=0) and,
        afterwards, to the host role (idx=-1). Every transfer carries
        ``(mode,)`` as its suffix.
        """
        key_holder = PaillierEncrypt()
        key_holder.generate_key(self.key_length)
        pub_key = key_holder.get_public_key()
        pri_key = key_holder.get_privacy_key()

        tag = (mode, )
        # (role, idx, message after pubkey, message after prikey), in send order.
        recipients = (
            (consts.GUEST, 0, "send pubkey to guest", "send prikey to guest"),
            (consts.HOST, -1, "send pubkey to host", "send prikey to host"),
        )
        for role, idx, pubkey_msg, prikey_msg in recipients:
            self.transfer_variable.paillier_pubkey.remote(obj=pub_key,
                                                          role=role,
                                                          idx=idx,
                                                          suffix=tag)
            LOGGER.info(pubkey_msg)
            self.transfer_variable.paillier_prikey.remote(obj=pri_key,
                                                          role=role,
                                                          idx=idx,
                                                          suffix=tag)
            LOGGER.info(prikey_msg)
Пример #2
0
 def keygen(self, key_length, suffix=tuple()) -> dict:
     """
     Build one Paillier cipher per client party that asked for encryption.

     The per-party encryption flags are fetched through ``self._use_encrypt``.
     For every party whose flag is truthy a new key pair of ``key_length``
     bits is generated and its public key is sent to that party only.

     Returns a dict mapping each party to its cipher, or to ``None`` when
     the party does not use encryption.
     """
     flags = self._use_encrypt.get_parties(parties=self._client_parties,
                                           suffix=suffix)
     ciphers = {}
     for party, wants_cipher in zip(self._client_parties, flags):
         cipher = None
         if wants_cipher:
             cipher = PaillierEncrypt()
             cipher.generate_key(key_length)
             self._pailler_pubkey.remote_parties(obj=cipher.get_public_key(),
                                                 parties=[party],
                                                 suffix=suffix)
         ciphers[party] = cipher
     return ciphers
Пример #3
0
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """
    Guest side of hetero feature binning.

    The guest creates a Paillier key pair, sends the public key and the
    encrypted labels to the host, and decrypts the per-bin sums the host
    returns so that IV/WOE can be calculated for the host's columns too.
    """

    def __init__(self):
        super().__init__()

        cipher = PaillierEncrypt()
        cipher.generate_key()
        self.encryptor = cipher
        self.party_name = consts.GUEST
        self.local_transform_result = None

    def fit(self, data_instances):
        """
        Fit split points locally, exchange encrypted statistics with the host
        and transform the data.

        When the labels are not binary only the split points are fitted and
        ``data_instances`` is returned untouched, because IV is calculated for
        binary labels only. Otherwise ``self.data_output`` is returned after
        ``self.transform`` has run.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        self.binning_obj.fit_split_points(data_instances)
        LOGGER.debug("After fit, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        if not data_overview.is_binary_labels(data_instances):
            LOGGER.warning("Iv is not supported for Multiple-label data.")
            return data_instances

        # Step 1: let the host know the public key.
        self.__synchronize_encryption()

        # Step 2: normalise labels to {0, 1} and extract them.
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda inst: inst.label)

        # Step 3: ship the encrypted (label, 1 - label) pairs to the host.
        encrypt_pair = functools.partial(self.encrypt,
                                         encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(encrypt_pair)
        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=0)
        LOGGER.info("Sent encrypted_label_table to host")

        # Step 4: do the local binning now, while the host may still be busy.
        data_instances = self.fit_local(data_instances, label_table)

        # Step 5: fetch the host's encrypted sums and turn them into IV/WOE.
        host_encrypted_sums = self.transfer_variable.encrypted_bin_sum.get(
            idx=0)
        LOGGER.info("Get encrypted_bin_sum from host")

        host_counts = self.__decrypt_bin_sum(host_encrypted_sums)
        # Only a single host is handled, hence the fixed key.
        self.host_results[consts.HOST] = self.binning_obj.cal_iv_woe(
            host_counts, self.model_param.adjustment_factor)
        self.set_schema(data_instances)

        LOGGER.debug("Before transform, binning_obj split_points: {}".format(
            self.binning_obj.split_points))

        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output

    @staticmethod
    def encrypt(x, encryptor):
        """Return the encrypted pair ``(x, 1 - x)``."""
        encrypted_value = encryptor.encrypt(x)
        encrypted_complement = encryptor.encrypt(1 - x)
        return encrypted_value, encrypted_complement

    def transform_local(self, data_instances, label_table=None):
        """
        Recompute the local IV using the split points stored in
        ``self.binning_result`` and log the IV of every column.

        Returns ``data_instances`` after its schema has been set.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)
        split_points = {
            col_name: iv_attr.split_points
            for col_name, iv_attr in self.binning_result.items()
        }

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances, split_points=split_points, label_table=label_table)

        for col_name, iv_attr in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(
                col_name, iv_attr.iv))
        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        """Send the public key to host 0 and flag the synchronisation as done."""
        self.transfer_variable.paillier_pubkey.remote(
            self.encryptor.get_public_key(), role=consts.HOST, idx=0)

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt every ``(event, non_event)`` pair of every column.

        The mapping is updated in place and the same object is returned.
        """
        for col_name, encrypted_pairs in encrypted_bin_sum.items():
            encrypted_bin_sum[col_name] = [
                (self.encryptor.decrypt(encrypted_event),
                 self.encryptor.decrypt(encrypted_non_event))
                for encrypted_event, encrypted_non_event in encrypted_pairs
            ]
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Calculate the local IV attributes and keep them in
        ``self.binning_result``; returns ``data_instances``.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        self.binning_result = self.binning_obj.cal_local_iv(
            data_instances, label_table=label_table)
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        """
        Collapse the label to a binary one: the event label is 1, anything
        else becomes 0. The instance is modified in place and returned.
        """
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
Пример #4
0
class HomoLRHost(HomoLRBase):
    """
    Host-side participant of the homogeneous federated training loop.

    On top of the inherited logistic-regression plumbing, ``fit`` trains a
    Keras model on one shard of Fashion-MNIST: for every batch the loss and
    gradients are Paillier-encrypted and sent to the arbiter, and the
    aggregated values received back are decrypted and applied with a local
    Adam optimizer.
    """

    def __init__(self):
        super(HomoLRHost, self).__init__()
        self.gradient_operator = None
        self.loss_history = []
        self.is_converged = False
        self.role = consts.HOST
        self.aggregator = aggregator.Host()
        self.model_weights = None
        self.cipher = paillier_cipher.Host()

        # Receives the shared key pair in __synchronize_encryption.
        self.zcl_encrypt_operator = PaillierEncrypt()

    def _init_model(self, params):
        """
        Initialise from ``params`` and pick the gradient operator.

        With the Paillier method encryption is switched on and the Taylor
        approximated gradient is used; otherwise the plain logistic gradient.
        """
        super()._init_model(params)
        self.cipher.register_paillier_cipher(self.transfer_variable)
        if params.encrypt_param.method in [consts.PAILLIER]:
            self.use_encrypt = True
            self.gradient_operator = TaylorLogisticGradient()
            self.re_encrypt_batches = params.re_encrypt_batches
        else:
            self.use_encrypt = False
            self.gradient_operator = LogisticGradient()

    def fit(self, data_instances, validate_data=None):
        """
        Run the federated training loop on this host.

        After the inherited set-up (schema, cipher public key, initial model
        weights) the shared Paillier keys and the party index/count are
        received, the Keras model and data shard are prepared, and then for
        ``self.max_iter`` epochs every batch is processed as follows:
        compute loss/gradients, encrypt and send them to the arbiter, receive
        and decrypt the aggregated values, apply the gradients and record
        train/test loss and accuracy.

        Returns None. The method returns early as soon as
        ``self.zcl_early_stop_batch`` batches of an epoch have been processed.
        """
        LOGGER.debug("Start data count: {}".format(data_instances.count()))

        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        # The returned strategy object is not used in this method.
        self.init_validation_strategy(data_instances, validate_data)

        pubkey = self.cipher.gen_paillier_pubkey(enable=self.use_encrypt, suffix=('fit',))
        if self.use_encrypt:
            self.cipher_operator.set_public_key(pubkey)

        self.model_weights = self._init_model_variables(data_instances)
        w = self.cipher_operator.encrypt_list(self.model_weights.unboxed)
        self.model_weights = LogisticRegressionWeights(w, self.model_weights.fit_intercept)

        LOGGER.debug("After init, model_weights: {}".format(self.model_weights.unboxed))

        mini_batch_obj = MiniBatch(data_inst=data_instances, batch_size=self.batch_size)

        total_batch_num = mini_batch_obj.batch_nums

        if self.use_encrypt:
            re_encrypt_times = total_batch_num // self.re_encrypt_batches + 1
            LOGGER.debug("re_encrypt_times is :{}, batch_size: {}, total_batch_num: {}, re_encrypt_batches: {}".format(
                re_encrypt_times, self.batch_size, total_batch_num, self.re_encrypt_batches))
            self.cipher.set_re_cipher_time(re_encrypt_times)

        total_data_num = data_instances.count()
        LOGGER.debug("Current data count: {}".format(total_data_num))

        self.__synchronize_encryption()
        self.zcl_idx, self.zcl_num_party = self.transfer_variable.num_party.get(idx=0, suffix=('train',))
        LOGGER.debug("party num:" + str(self.zcl_num_party))
        self.__init_model()

        self.train_loss_results = []
        self.train_accuracy_results = []
        self.test_loss_results = []
        self.test_accuracy_results = []

        # The key pair does not change during training, so fetch it once.
        public_key = self.zcl_encrypt_operator.get_public_key()
        private_key = self.zcl_encrypt_operator.get_privacy_key()

        for iter_num in range(self.max_iter):
            LOGGER.debug("In iter: {}".format(iter_num))
            batch_num = 0

            for train_x, train_y in self.zcl_dataset:
                LOGGER.info("Staring batch {}".format(batch_num))
                start_t = time.time()
                loss_value, grads = self.__grad(self.zcl_model, train_x, train_y)
                loss_value = loss_value.numpy()
                grads = [g.numpy() for g in grads]
                LOGGER.info("Start encrypting")
                loss_value = batch_encryption.encrypt(public_key, loss_value)
                grads = [batch_encryption.encrypt_matrix(public_key, g) for g in grads]
                LOGGER.info("Finish encrypting")
                grads = Gradients(grads)
                # Each (epoch, batch) pair gets its own transfer suffix.
                step_suffix = (iter_num, batch_num)
                self.transfer_variable.host_grad.remote(obj=grads.for_remote(), role=consts.ARBITER, idx=0,
                                                        suffix=step_suffix)
                LOGGER.info("Sent grads")
                self.transfer_variable.host_loss.remote(obj=loss_value, role=consts.ARBITER, idx=0,
                                                        suffix=step_suffix)
                LOGGER.info("Sent loss")

                sum_grads = self.transfer_variable.aggregated_grad.get(idx=0, suffix=step_suffix)
                LOGGER.info("Got grads")
                sum_loss = self.transfer_variable.aggregated_loss.get(idx=0, suffix=step_suffix)
                LOGGER.info("Got loss")

                sum_loss = batch_encryption.decrypt(private_key, sum_loss)
                sum_grads = [
                    batch_encryption.decrypt_matrix(private_key, g).astype(np.float32)
                    for g in sum_grads.unboxed
                ]
                LOGGER.info("Finish decrypting")

                self.zcl_optimizer.apply_gradients(zip(sum_grads, self.zcl_model.trainable_variables),
                                                   self.zcl_global_step)

                elapsed_time = time.time() - start_t
                self.train_loss_results.append(sum_loss)
                train_accuracy_v = accuracy_score(train_y,
                                                  tf.argmax(self.zcl_model(train_x), axis=1, output_type=tf.int32))
                self.train_accuracy_results.append(train_accuracy_v)
                test_loss_v = self.__loss(self.zcl_model, self.zcl_x_test, self.zcl_y_test)
                self.test_loss_results.append(test_loss_v)
                test_accuracy_v = accuracy_score(self.zcl_y_test,
                                                 tf.argmax(self.zcl_model(self.zcl_x_test), axis=1,
                                                           output_type=tf.int32))
                self.test_accuracy_results.append(test_accuracy_v)

                LOGGER.info(
                    "Epoch {:03d}, iteration {:03d}: train_loss: {:.3f}, train_accuracy: {:.3%}, test_loss: {:.3f}, "
                    "test_accuracy: {:.3%}, elapsed_time: {:.4f}".format(
                        iter_num,
                        batch_num,
                        sum_loss,
                        train_accuracy_v,
                        test_loss_v,
                        test_accuracy_v,
                        elapsed_time)
                )

                batch_num += 1

                # NOTE(review): zcl_early_stop_batch is not assigned anywhere in
                # this class; presumably it comes from the base class or the
                # job configuration - confirm. Returning here also skips the
                # n_iter_ update below.
                if batch_num >= self.zcl_early_stop_batch:
                    return

            self.n_iter_ = iter_num

    def __synchronize_encryption(self, mode='train'):
        """
        Receive the shared Paillier public and private key (sender idx=0,
        suffix ``(mode,)``) and install both on ``self.zcl_encrypt_operator``.
        """
        pub_key = self.transfer_variable.paillier_pubkey.get(idx=0, suffix=(mode,))
        LOGGER.debug("Received pubkey")
        self.zcl_encrypt_operator.set_public_key(pub_key)
        pri_key = self.transfer_variable.paillier_prikey.get(idx=0, suffix=(mode,))
        LOGGER.debug("Received prikey")
        self.zcl_encrypt_operator.set_privacy_key(pri_key)

    def __init_model(self):
        """
        Load the Keras model and prepare this party's data, loss and optimizer.

        The architecture is read from ``MODEL_JSON_DIR`` and the weights from
        ``MODEL_WEIGHT_DIR``. Fashion-MNIST is loaded, scaled to [0, 1] and the
        training set is cut into ``self.zcl_num_party`` consecutive pieces of
        which this party keeps piece ``self.zcl_idx``. The test set is kept
        whole.
        """
        # Context manager so the handle is closed even if read() raises.
        with open(MODEL_JSON_DIR, 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights(MODEL_WEIGHT_DIR)
        self.zcl_model = loaded_model
        LOGGER.info("Initialed model")

        # The data, split between train and test sets:
        (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255.0
        x_test /= 255.0
        y_train = y_train.squeeze().astype(np.int32)
        y_test = y_test.squeeze().astype(np.int32)

        # Boundaries of equally sized shards; the last shard takes the remainder.
        avg_length = len(x_train) // self.zcl_num_party
        split_idx = [part * avg_length for part in range(1, self.zcl_num_party)]
        x_train = np.split(x_train, split_idx)[self.zcl_idx]
        y_train = np.split(y_train, split_idx)[self.zcl_idx]

        train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        BATCH_SIZE = 128
        SHUFFLE_BUFFER_SIZE = 1000
        train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True).batch(BATCH_SIZE)
        self.zcl_dataset = train_dataset
        self.zcl_x_test = x_test
        self.zcl_y_test = y_test

        self.zcl_cce = tf.keras.losses.SparseCategoricalCrossentropy()
        self.zcl_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        self.zcl_global_step = tf.Variable(0)

    def __loss(self, model, x, y):
        """Return the sparse categorical cross-entropy of ``model(x)`` against ``y``."""
        y_ = model(x)
        return self.zcl_cce(y_true=y, y_pred=y_)

    def __grad(self, model, inputs, targets):
        """Return ``(loss, gradients)`` with respect to the model's trainable variables."""
        with tf.GradientTape() as tape:
            loss_value = self.__loss(model, inputs, targets)
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    def __clip_gradients(self, grads, min_v, max_v):
        """Clip every gradient tensor to ``[min_v, max_v]`` and return numpy arrays."""
        return [tf.clip_by_value(t, min_v, max_v).numpy() for t in grads]

    def predict(self, data_instances):
        """
        Predict on ``data_instances``.

        With encryption enabled, the aggregated model is fetched, ``wx`` is
        computed locally and sent to the arbiter, and the returned result is
        joined with the labels as ``[label, result, None, {"0": None, "1": None}]``.
        Without encryption the local weights are used and each row becomes
        ``[label, x[1], x[0], {"1": x[0], "0": 1 - x[0]}]`` where ``x`` is the
        output of ``self.classify``.
        """
        LOGGER.info('Start predict task')
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        suffix = ('predict',)
        pubkey = self.cipher.gen_paillier_pubkey(enable=self.use_encrypt, suffix=suffix)

        if self.use_encrypt:
            self.cipher_operator.set_public_key(pubkey)
            final_model = self.transfer_variable.aggregated_model.get(idx=0, suffix=suffix)
            model_weights = LogisticRegressionWeights(final_model.unboxed, self.fit_intercept)
            wx = self.compute_wx(data_instances, model_weights.coef_, model_weights.intercept_)
            self.transfer_variable.predict_wx.remote(wx, consts.ARBITER, 0, suffix=suffix)
            predict_result = self.transfer_variable.predict_result.get(idx=0, suffix=suffix)
            predict_result = predict_result.join(data_instances, lambda p, d: [d.label, p, None,
                                                                                     {"0": None, "1": None}])

        else:
            predict_wx = self.compute_wx(data_instances, self.model_weights.coef_, self.model_weights.intercept_)
            pred_table = self.classify(predict_wx, self.model_param.predict_param.threshold)
            predict_result = data_instances.mapValues(lambda x: x.label)
            predict_result = pred_table.join(predict_result, lambda x, y: [y, x[1], x[0],
                                                                           {"1": x[0], "0": 1 - x[0]}])
        return predict_result

    def _get_param(self):
        """
        Build the ``LRModelParam`` protobuf for this model.

        Weights and intercept are only exported when encryption is off; with
        encryption on the weight dict stays empty and the intercept is 0.
        """
        header = self.header

        weight_dict = {}
        intercept = 0
        if not self.use_encrypt:
            lr_vars = self.model_weights.coef_
            for idx, header_name in enumerate(header):
                weight_dict[header_name] = lr_vars[idx]
            intercept = self.model_weights.intercept_

        param_protobuf_obj = lr_model_param_pb2.LRModelParam(iters=self.n_iter_,
                                                             loss_history=self.loss_history,
                                                             is_converged=self.is_converged,
                                                             weight=weight_dict,
                                                             intercept=intercept,
                                                             header=header)
        from google.protobuf import json_format
        json_result = json_format.MessageToJson(param_protobuf_obj)
        LOGGER.debug("json_result: {}".format(json_result))
        return param_protobuf_obj
Пример #5
0
class HomoLRGuest(HomoLRBase):
    """
    Guest-side participant of the homogeneous federated training loop.

    ``fit`` trains a Keras model on one shard of Fashion-MNIST: for every
    batch the loss and gradients are Paillier-encrypted and sent to the
    arbiter, and the aggregated values received back are decrypted and
    applied with a local Adam optimizer.
    """

    def __init__(self):
        super(HomoLRGuest, self).__init__()
        self.gradient_operator = LogisticGradient()
        self.loss_history = []
        self.role = consts.GUEST
        self.aggregator = aggregator.Guest()

        # Receives the shared key pair in __synchronize_encryption.
        self.zcl_encrypt_operator = PaillierEncrypt()

    def _init_model(self, params):
        """Delegate model initialisation to the base class."""
        super()._init_model(params)

    def fit(self, data_instances, validate_data=None):
        """
        Run the federated training loop on the guest.

        After the inherited set-up the shared Paillier keys and the party
        index/count are received, the Keras model and data shard are
        prepared, and then for ``self.max_iter`` epochs every batch is
        processed as follows: compute loss/gradients, encrypt and send them
        to the arbiter, receive and decrypt the aggregated values, apply the
        gradients and record train/test loss and accuracy.

        Returns None. The method returns early as soon as
        ``self.zcl_early_stop_batch`` batches of an epoch have been processed.
        """
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)

        # The returned strategy object is not used in this method.
        self.init_validation_strategy(data_instances, validate_data)
        self.model_weights = self._init_model_variables(data_instances)

        # NOTE(review): neither result is used below; the calls are kept
        # because they touch project objects whose side effects are not
        # visible here - confirm before removing.
        total_data_num = data_instances.count()
        mini_batch_obj = MiniBatch(data_inst=data_instances,
                                   batch_size=self.batch_size)

        self.__synchronize_encryption()
        self.zcl_idx, self.zcl_num_party = self.transfer_variable.num_party.get(
            idx=0, suffix=('train', ))
        LOGGER.debug("party num:" + str(self.zcl_num_party))
        self.__init_model()

        self.train_loss_results = []
        self.train_accuracy_results = []
        self.test_loss_results = []
        self.test_accuracy_results = []

        # The key pair does not change during training, so fetch it once.
        public_key = self.zcl_encrypt_operator.get_public_key()
        private_key = self.zcl_encrypt_operator.get_privacy_key()

        for iter_num in range(self.max_iter):
            batch_num = 0

            for train_x, train_y in self.zcl_dataset:
                LOGGER.info("Staring batch {}".format(batch_num))
                start_t = time.time()
                loss_value, grads = self.__grad(self.zcl_model, train_x,
                                                train_y)
                loss_value = loss_value.numpy()
                grads = [g.numpy() for g in grads]
                LOGGER.info("Start encrypting")
                loss_value = batch_encryption.encrypt(public_key, loss_value)
                grads = [
                    batch_encryption.encrypt_matrix(public_key, g)
                    for g in grads
                ]
                grads = Gradients(grads)
                LOGGER.info("Finish encrypting")
                # Each (epoch, batch) pair gets its own transfer suffix.
                step_suffix = (iter_num, batch_num)
                self.transfer_variable.guest_grad.remote(
                    obj=grads.for_remote(),
                    role=consts.ARBITER,
                    idx=0,
                    suffix=step_suffix)
                LOGGER.info("Sent grads")
                self.transfer_variable.guest_loss.remote(obj=loss_value,
                                                         role=consts.ARBITER,
                                                         idx=0,
                                                         suffix=step_suffix)
                LOGGER.info("Sent loss")

                sum_grads = self.transfer_variable.aggregated_grad.get(
                    idx=0, suffix=step_suffix)
                LOGGER.info("Got grads")
                sum_loss = self.transfer_variable.aggregated_loss.get(
                    idx=0, suffix=step_suffix)
                LOGGER.info("Got loss")

                sum_loss = batch_encryption.decrypt(private_key, sum_loss)
                sum_grads = [
                    batch_encryption.decrypt_matrix(private_key,
                                                    g).astype(np.float32)
                    for g in sum_grads.unboxed
                ]
                LOGGER.info("Finish decrypting")

                self.zcl_optimizer.apply_gradients(
                    zip(sum_grads, self.zcl_model.trainable_variables),
                    self.zcl_global_step)

                elapsed_time = time.time() - start_t
                self.train_loss_results.append(sum_loss)
                train_accuracy_v = accuracy_score(
                    train_y,
                    tf.argmax(self.zcl_model(train_x),
                              axis=1,
                              output_type=tf.int32))
                self.train_accuracy_results.append(train_accuracy_v)
                test_loss_v = self.__loss(self.zcl_model, self.zcl_x_test,
                                          self.zcl_y_test)
                self.test_loss_results.append(test_loss_v)
                test_accuracy_v = accuracy_score(
                    self.zcl_y_test,
                    tf.argmax(self.zcl_model(self.zcl_x_test),
                              axis=1,
                              output_type=tf.int32))
                self.test_accuracy_results.append(test_accuracy_v)

                LOGGER.info(
                    "Epoch {:03d}, iteration {:03d}: train_loss: {:.3f}, train_accuracy: {:.3%}, test_loss: {:.3f}, "
                    "test_accuracy: {:.3%}, elapsed_time: {:.4f}".format(
                        iter_num, batch_num, sum_loss, train_accuracy_v,
                        test_loss_v, test_accuracy_v, elapsed_time))

                batch_num += 1

                # NOTE(review): zcl_early_stop_batch is not assigned anywhere in
                # this class; presumably it comes from the base class or the
                # job configuration - confirm. Returning here also skips the
                # n_iter_ update below.
                if batch_num >= self.zcl_early_stop_batch:
                    return

            self.n_iter_ = iter_num

    def __synchronize_encryption(self, mode='train'):
        """
        Receive the shared Paillier public and private key (sender idx=0,
        suffix ``(mode,)``) and install both on ``self.zcl_encrypt_operator``.
        """
        pub_key = self.transfer_variable.paillier_pubkey.get(idx=0,
                                                             suffix=(mode, ))
        LOGGER.debug("Received pubkey")
        self.zcl_encrypt_operator.set_public_key(pub_key)
        pri_key = self.transfer_variable.paillier_prikey.get(idx=0,
                                                             suffix=(mode, ))
        LOGGER.debug("Received prikey")
        self.zcl_encrypt_operator.set_privacy_key(pri_key)

    def __init_model(self):
        """
        Load the Keras model and prepare this party's data, loss and optimizer.

        The architecture is read from ``MODEL_JSON_DIR`` and the weights from
        ``MODEL_WEIGHT_DIR``. Fashion-MNIST is loaded, scaled to [0, 1] and the
        training set is cut into ``self.zcl_num_party`` consecutive pieces of
        which this party keeps piece ``self.zcl_idx``. The test set is kept
        whole.
        """
        # Context manager so the handle is closed even if read() raises.
        with open(MODEL_JSON_DIR, 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = keras.models.model_from_json(loaded_model_json)
        loaded_model.load_weights(MODEL_WEIGHT_DIR)
        self.zcl_model = loaded_model
        LOGGER.info("Initialed model")

        # The data, split between train and test sets:
        (x_train,
         y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255.0
        x_test /= 255.0
        y_train = y_train.squeeze().astype(np.int32)
        y_test = y_test.squeeze().astype(np.int32)

        # Boundaries of equally sized shards; the last shard takes the remainder.
        avg_length = len(x_train) // self.zcl_num_party
        split_idx = [part * avg_length for part in range(1, self.zcl_num_party)]
        x_train = np.split(x_train, split_idx)[self.zcl_idx]
        y_train = np.split(y_train, split_idx)[self.zcl_idx]

        train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        BATCH_SIZE = 128
        SHUFFLE_BUFFER_SIZE = 1000
        train_dataset = train_dataset.shuffle(
            SHUFFLE_BUFFER_SIZE,
            reshuffle_each_iteration=True).batch(BATCH_SIZE)
        self.zcl_dataset = train_dataset
        self.zcl_x_test = x_test
        self.zcl_y_test = y_test

        self.zcl_cce = tf.keras.losses.SparseCategoricalCrossentropy()
        self.zcl_optimizer = tf.train.AdamOptimizer(
            learning_rate=LEARNING_RATE)
        self.zcl_global_step = tf.Variable(0)

    def __loss(self, model, x, y):
        """Return the sparse categorical cross-entropy of ``model(x)`` against ``y``."""
        y_ = model(x)
        return self.zcl_cce(y_true=y, y_pred=y_)

    def __grad(self, model, inputs, targets):
        """Return ``(loss, gradients)`` with respect to the model's trainable variables."""
        with tf.GradientTape() as tape:
            loss_value = self.__loss(model, inputs, targets)
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    def __clip_gradients(self, grads, min_v, max_v):
        """Clip every gradient tensor to ``[min_v, max_v]`` and return numpy arrays."""
        return [tf.clip_by_value(t, min_v, max_v).numpy() for t in grads]

    def predict(self, data_instances):
        """
        Predict on ``data_instances`` with the local model weights.

        Each output row is ``[label, x[1], x[0], {"1": x[0], "0": 1 - x[0]}]``
        where ``x`` is the output of ``self.classify`` for that row.
        """
        self._abnormal_detection(data_instances)
        self.init_schema(data_instances)
        predict_wx = self.compute_wx(data_instances, self.model_weights.coef_,
                                     self.model_weights.intercept_)

        pred_table = self.classify(predict_wx,
                                   self.model_param.predict_param.threshold)

        predict_result = data_instances.mapValues(lambda x: x.label)
        predict_result = pred_table.join(
            predict_result,
            lambda x, y: [y, x[1], x[0], {
                "1": x[0],
                "0": 1 - x[0]
            }])
        return predict_result
 def setUp(self):
     """Generate a fresh Paillier key pair and store both keys on the instance."""
     cipher = PaillierEncrypt()
     cipher.generate_key()
     self.publickey = cipher.get_public_key()
     self.privatekey = cipher.get_privacy_key()
Пример #7
0
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """Guest-side driver for hetero feature binning.

    The guest generates a Paillier key pair, sends the public key and the
    encrypted labels to the host, bins its own features locally, and
    decrypts the per-bin sums returned by the host in order to compute the
    host features' IV/WOE attributes.
    """

    def __init__(self, params: FeatureBinningParam):
        """Build the guest and generate its Paillier key pair."""
        super(HeteroFeatureBinningGuest, self).__init__(params)

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        # Filled by transform_local with the result of cal_local_iv.
        self.local_transform_result = None
        self.party_name = consts.GUEST
        self._init_binning_obj()

    def fit(self, data_instances):
        """
        Bin the local features and compute IV/WOE for the host's features.

        The public key and the encrypted labels are sent to the host, the
        local binning is computed while the host works, and the encrypted
        per-bin sums received from the host are decrypted and turned into
        IV/WOE attributes stored in ``self.host_results[consts.HOST]``.

        Returns the data instances as returned by ``fit_local`` (labels
        already normalised by ``load_data``).
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculates self's binning. In case the other party need time to compute its data,
        #  do binning calculation at this point.
        data_instances = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__receive_encrypted_bin_sum()
        LOGGER.info("Get encrypted_bin_sum from host")
        host_iv_attrs = self.__compute_host_iv_attrs(encrypted_bin_sum)

        # Support one host only in this version. Multiple host will be supported in the future.
        self.host_results[consts.HOST] = host_iv_attrs

        for cols_name, iv_attr in host_iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Host] feature {} 's result is : {}".
                format(cols_name, display_result))

        self.set_schema(data_instances)
        return data_instances

    def transform(self, data_instances):
        """
        Repeat the label exchange on ``data_instances`` and log the IVs.

        The local IV is recomputed with the split points already stored in
        ``self.binning_result`` (see ``transform_local``); the host's IV is
        computed from the decrypted bin sums. Results are only logged:
        ``transform_local`` is called with ``save_result=False`` and the host
        attributes are not stored.

        Returns the data instances with labels normalised by ``load_data``.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)
        self.set_schema(data_instances)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally
        self.transform_local(data_instances,
                             label_table=label_table,
                             save_result=False)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__receive_encrypted_bin_sum()
        host_iv_attrs = self.__compute_host_iv_attrs(encrypted_bin_sum)

        for col_name, iv_attr in host_iv_attrs.items():
            LOGGER.info("The remote feature {} 's iv is {}".format(
                col_name, iv_attr.iv))

        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        """Return the pair ``(encrypt(x), encrypt(1 - x))``.

        NOTE(review): presumably ``x`` is a 0/1 label, making the pair the
        encrypted event / non-event indicators -- confirm against the host.
        """
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self,
                        data_instances,
                        label_table=None,
                        save_result=True):
        """
        Recompute the local IV using the split points in ``binning_result``.

        The result of ``cal_local_iv`` is kept in
        ``self.local_transform_result`` and, when ``save_result`` is true,
        persisted through ``save_model`` with empty host results.

        Returns ``data_instances`` unchanged (after refreshing the schema).
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        split_points = {
            col_name: iv_attr.split_points
            for col_name, iv_attr in self.binning_result.items()
        }

        self.local_transform_result = self.binning_obj.cal_local_iv(
            data_instances, split_points=split_points, label_table=label_table)

        if save_result:
            self.save_model(name=self.bin_param.transform_table,
                            namespace=self.bin_param.result_namespace,
                            binning_result=self.local_transform_result,
                            host_results={})

        for col_name, iv_attr in self.local_transform_result.items():
            LOGGER.info("The local feature {} 's iv is {}".format(
                col_name, iv_attr.iv))

        self.set_schema(data_instances)
        return data_instances

    def __synchronize_encryption(self):
        """Send the Paillier public key to the host and mark sync as done."""
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)

        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __send_encrypted_labels(self, label_table):
        """Encrypt every label with ``self.encrypt`` and send the table to the host."""
        encrypt_label = functools.partial(self.encrypt,
                                          encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(encrypt_label)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

    def __receive_encrypted_bin_sum(self):
        """Fetch the encrypted per-bin sums sent by the host (index 0)."""
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        return federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

    def __compute_host_iv_attrs(self, encrypted_bin_sum):
        """Decrypt the host's bin sums and derive its IV/WOE attributes."""
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        return self.binning_obj.cal_iv_woe(result_counts,
                                           self.bin_param.adjustment_factor)

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt ``{col_name: [(enc_event, enc_non_event), ...]}`` in place.

        Each column's list is replaced by a list of plaintext
        ``(event_count, non_event_count)`` tuples; the same dict object is
        returned.
        """
        decrypt = self.encryptor.decrypt
        # Only values of existing keys are replaced, so iterating over
        # items() while assigning is safe.
        for col_name, count_list in encrypted_bin_sum.items():
            encrypted_bin_sum[col_name] = [
                (decrypt(encrypted_event), decrypt(encrypted_non_event))
                for encrypted_event, encrypted_non_event in count_list
            ]
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Compute the local binning/IV attributes and log them per feature.

        The result of ``cal_local_iv`` is stored in ``self.binning_result``.
        Returns ``data_instances`` unchanged (after refreshing the schema).
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 label_table=label_table)
        for col_name, iv_attr in iv_attrs.items():
            display_result = iv_attr.display_result(
                self.bin_param.display_result)
            LOGGER.info(
                "[Result][FeatureBinning][Guest] feature {} 's result is : {}".
                format(col_name, display_result))

        self.binning_result = iv_attrs
        self.set_schema(data_instances)
        return data_instances

    @staticmethod
    def load_data(data_instance):
        """Collapse the label to binary: anything other than 1 becomes 0.

        The instance is modified in place and returned.
        """
        # Here suppose this is a binary question and the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance
Пример #8
0
class HeteroFeatureBinningGuest(BaseHeteroFeatureBinning):
    """Guest-side driver for hetero feature binning (list-based results).

    The guest generates a Paillier key pair, sends the public key and the
    encrypted labels to the host, computes the IV of its own columns, and
    decrypts the per-bin sums returned by the host in order to compute the
    host features' IV/WOE attributes.
    """

    def __init__(self, params: FeatureBinningParam):
        """Build the guest and generate its Paillier key pair."""
        super(HeteroFeatureBinningGuest, self).__init__(params)

        self.encryptor = PaillierEncrypt()
        self.encryptor.generate_key()
        # Local IV attributes; populated by fit_local / transform_local.
        self.iv_attrs = None
        # Host IV attributes; populated by fit / transform.
        self.host_iv_attrs = None

    def fit(self, data_instances):
        """
        Compute IV attributes for the local columns and for the host's.

        The public key and the encrypted labels are sent to the host, the
        local IV is computed while the host works, and the encrypted per-bin
        sums received from the host are decrypted and turned into IV/WOE
        attributes.

        Returns ``{'local': <local iv attrs>, 'remote': <host iv attrs>}``.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host")

        # 4. Calculates self's binning. In case the other party need time to compute its data,
        #  do binning calculation at this point.
        local_iv = self.fit_local(data_instances, label_table)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__receive_encrypted_bin_sum()
        LOGGER.info("Get encrypted_bin_sum from host")
        host_iv_attrs = self.__compute_host_iv_attrs(encrypted_bin_sum)
        self.host_iv_attrs = host_iv_attrs
        self.__log_host_iv(host_iv_attrs)

        return {'local': local_iv, 'remote': host_iv_attrs}

    def transform(self, data_instances):
        """
        Repeat the label exchange on ``data_instances`` and refresh the IVs.

        The local IV is recomputed with the split points already held in
        ``self.iv_attrs`` (see ``transform_local``); the host's IV attributes
        are recomputed from the decrypted bin sums and stored in
        ``self.host_iv_attrs``.

        Returns the data instances with labels normalised by ``load_data``
        and the original header written back into their schema.
        """
        self._abnormal_detection(data_instances)

        # Remember the header so it can be restored on the mapped table.
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]

        self._parse_cols(data_instances)

        # 1. Synchronize encryption information
        self.__synchronize_encryption()

        # 2. Prepare labels
        data_instances = data_instances.mapValues(self.load_data)
        label_table = data_instances.mapValues(lambda x: x.label)

        # 3. Transfer encrypted label
        self.__send_encrypted_labels(label_table)
        LOGGER.info("Sent encrypted_label_table to host for transform")

        # 4. Transform locally (labels were already normalised above)
        self.transform_local(data_instances, reformated=True)

        # 5. Received host result and calculate iv value
        encrypted_bin_sum = self.__receive_encrypted_bin_sum()
        host_iv_attrs = self.__compute_host_iv_attrs(encrypted_bin_sum)
        self.host_iv_attrs = host_iv_attrs
        self.__log_host_iv(host_iv_attrs)

        data_instances.schema['header'] = self.header
        return data_instances

    @staticmethod
    def encrypt(x, encryptor):
        """Return the pair ``(encrypt(x), encrypt(1 - x))``.

        NOTE(review): presumably ``x`` is a 0/1 label, making the pair the
        encrypted event / non-event indicators -- confirm against the host.
        """
        return encryptor.encrypt(x), encryptor.encrypt(1 - x)

    def transform_local(self, data_instances, reformated=False):
        """
        Recompute the local IV using the split points in ``self.iv_attrs``.

        ``self.iv_attrs`` must already be populated (``fit_local`` sets it);
        it is ``None`` right after construction, in which case iterating it
        fails. When ``reformated`` is false the labels are first normalised
        with ``load_data``. The new attributes replace ``self.iv_attrs``;
        nothing is returned.
        """
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)

        if not reformated:  # Reformat the label type
            data_instances = data_instances.mapValues(self.load_data)

        split_points = [list(iv_attr.split_points) for iv_attr in self.iv_attrs]

        self.iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                      self.cols, split_points)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, self.iv_attrs[idx].iv))

    def __synchronize_encryption(self):
        """Send the Paillier public key to the host and mark sync as done."""
        pub_key = self.encryptor.get_public_key()
        pubkey_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.paillier_pubkey)

        federation.remote(pub_key,
                          name=self.transfer_variable.paillier_pubkey.name,
                          tag=pubkey_id,
                          role=consts.HOST,
                          idx=0)

        LOGGER.info("send pubkey to host")
        self.has_synchronized = True

    def __send_encrypted_labels(self, label_table):
        """Encrypt every label with ``self.encrypt`` and send the table to the host."""
        encrypt_label = functools.partial(self.encrypt,
                                          encryptor=self.encryptor)
        encrypted_label_table = label_table.mapValues(encrypt_label)

        encrypted_label_table_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_label)
        federation.remote(encrypted_label_table,
                          name=self.transfer_variable.encrypted_label.name,
                          tag=encrypted_label_table_id,
                          role=consts.HOST,
                          idx=0)

    def __receive_encrypted_bin_sum(self):
        """Fetch the encrypted per-bin sums sent by the host (index 0)."""
        encrypted_bin_sum_id = self.transfer_variable.generate_transferid(
            self.transfer_variable.encrypted_bin_sum)
        return federation.get(
            name=self.transfer_variable.encrypted_bin_sum.name,
            tag=encrypted_bin_sum_id,
            idx=0)

    def __compute_host_iv_attrs(self, encrypted_bin_sum):
        """Decrypt the host's bin sums and derive its IV/WOE attributes."""
        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum)
        return self.binning_obj.cal_iv_woe(result_counts,
                                           self.bin_param.adjustment_factor)

    @staticmethod
    def __log_host_iv(host_iv_attrs):
        """Log the IV of every host feature by position."""
        for idx, iv_attr in enumerate(host_iv_attrs):
            LOGGER.info("The remote iv of {}th measured feature is {}".format(
                idx, iv_attr.iv))

    def __decrypt_bin_sum(self, encrypted_bin_sum):
        """
        Decrypt the nested ``(enc_event, enc_non_event)`` pairs in place.

        ``encrypted_bin_sum`` is iterated as a sequence of per-feature
        lists; every pair is overwritten by its plaintext
        ``(event_count, non_event_count)`` tuple and the same outer object
        is returned.
        """
        decrypt = self.encryptor.decrypt
        for feature_sum in encrypted_bin_sum:
            for idx, (encrypted_event,
                      encrypted_non_event) in enumerate(feature_sum):
                feature_sum[idx] = (decrypt(encrypted_event),
                                    decrypt(encrypted_non_event))
        return encrypted_bin_sum

    def fit_local(self, data_instances, label_table=None):
        """
        Compute the IV attributes of the local columns and log them.

        The result of ``cal_local_iv`` (indexed in the order of
        ``self.cols``) is stored in ``self.iv_attrs`` and returned.
        """
        self._abnormal_detection(data_instances)

        self._parse_cols(data_instances)

        iv_attrs = self.binning_obj.cal_local_iv(data_instances,
                                                 self.cols,
                                                 label_table=label_table)
        for idx, col in enumerate(self.cols):
            LOGGER.info("The local iv of {}th feature is {}".format(
                col, iv_attrs[idx].iv))
        self.iv_attrs = iv_attrs
        return iv_attrs

    @staticmethod
    def load_data(data_instance):
        """Collapse the label to binary: anything other than 1 becomes 0.

        The instance is modified in place and returned.
        """
        # Here suppose this is a binary question and the event label is 1
        if data_instance.label != 1:
            data_instance.label = 0
        return data_instance