Example #1
    def setUp(self):
        session.init("test_cross_entropy")
        self.softmax_loss = SoftmaxCrossEntropyLoss()
        self.y_list = [i % 5 for i in range(100)]
        self.predict_list = [np.array([random.random() for i in range(5)]) for j in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False, partition=16)
        self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
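A minimal sanity check for this fixture (not part of the original test; it assumes the class is a unittest.TestCase and uses only the DTable count() method that appears in later examples) might look like:

    def test_parallelize_count(self):
        # Hypothetical check: each distributed table should hold exactly one
        # row per element of the local list it was built from.
        self.assertEqual(self.y.count(), len(self.y_list))
        self.assertEqual(self.predict.count(), len(self.predict_list))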
Example #2
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.wx = session.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = session.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = session.parallelize([
            Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
            for i in range(size)
        ],
                                             partition=1)

        # test fore_gradient
        self.fore_gradient_local = [
            -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
        ]
        # test gradient
        self.gradient = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125
        ]
        self.gradient_fit_intercept = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125, 1.125
        ]

        self.loss = 4.505647
Example #3
    def setUp(self):
        session.init("test_random_sampler")
        self.data = [(i * 10 + 5, i * i) for i in range(100)]
        self.table = session.parallelize(self.data, include_key=True)
        self.data_to_trans = [(i * 10 + 5, i * i * i) for i in range(100)]
        self.table_trans = session.parallelize(self.data_to_trans, include_key=True)
Example #4
    def setUp(self):
        session.init("test_cross_entropy")
        self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss()
        self.y_list = [i % 2 for i in range(100)]
        self.predict_list = [random.random() for i in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False, partition=16)
        self.predict = session.parallelize(self.predict_list, include_key=False, partition=16)
Example #5
    def setUp(self):
        self.data = []
        self.data_with_value = []
        for i in range(100):
            row = []
            row_with_value = []
            for j in range(100):
                if random.randint(1, 100) > 30:
                    continue
                str_r = ''.join(
                    random.sample(string.ascii_letters + string.digits, 10))
                row.append(str_r)
                row_with_value.append(str_r + ':' + str(random.random()))

            self.data.append((i, ' '.join(row)))
            self.data_with_value.append((i, ' '.join(row_with_value)))

        self.table1 = session.parallelize(self.data,
                                          include_key=True,
                                          partition=16)
        self.table2 = session.parallelize(self.data_with_value,
                                          include_key=True,
                                          partition=16)
        self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
        self.args2 = {"data": {"data_io_1": {"data": self.table2}}}

        self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
Example #6
    def setUp(self):
        session.init("test_instance")

        dense_inst = []
        headers = ['x' + str(i) for i in range(20)]
        for i in range(100):
            inst = Instance(features=(i % 16 * np.ones(20)))
            dense_inst.append((i, inst))
        self.dense_table = session.parallelize(dense_inst, include_key=True, partition=2)
        self.dense_table.schema = {'header': headers}

        self.sparse_inst = []
        for i in range(100):
            dict = {}
            indices = []
            data = []
            for j in range(20):
                idx = random.randint(0, 29)
                if idx in dict:
                    continue
                dict[idx] = 1
                val = random.random()
                indices.append(idx)
                data.append(val)

            sparse_vec = SparseVector(indices, data, 30)
            self.sparse_inst.append((i, Instance(features=sparse_vec)))

        self.sparse_table = session.parallelize(self.sparse_inst, include_key=True)
        self.sparse_table.schema = {"header": ["fid" + str(i) for i in range(30)]}
Example #7
    def setUp(self):
        self.feature_histogram = FeatureHistogram()
        session.init("test_feature_histogram")
        data_insts = []
        for i in range(1000):
            indices = []
            data = []
            for j in range(10):
                x = random.randint(0, 5)
                if x != 0:
                    data.append(x)
                    indices.append(j)
            sparse_vec = SparseVector(indices, data, shape=10)
            data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = session.parallelize(data_insts, include_key=False, partition=16)

        self.grad_and_hess_list = [(random.random(), random.random()) for i in range(1000)]
        self.grad_and_hess = session.parallelize(self.grad_and_hess_list, include_key=False, partition=16)

        bin_split_points = []
        for i in range(10):
            bin_split_points.append(np.array([i for i in range(5)]))
        self.bin_split_points = np.array(bin_split_points)
        self.bin_sparse = [0 for i in range(10)]
Example #8
    def setUp(self):
        session.init("test_least_abs_error_loss")
        self.lae_loss = LeastAbsoluteErrorLoss()
        self.y_list = [i % 2 for i in range(100)]
        self.predict_list = [random.random() for i in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False)
        self.predict = session.parallelize(self.predict_list, include_key=False)
Example #9
    def setUp(self):
        session.init("test_fair_loss")
        self.log_cosh_loss = LogCoshLoss()
        self.y_list = [i % 2 for i in range(100)]
        self.predict_list = [random.random() for i in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False)
        self.predict = session.parallelize(self.predict_list, include_key=False)
Example #10
    def predict(self, data_inst):
        data = self.data_converter.convert(data_inst,
                                           batch_size=self.batch_size,
                                           task_type=self.task_type)  #change
        predict = self.nn_model.predict(data)
        num_output_units = predict.shape[1]

        threshold = self.param.predict_param.threshold

        if num_output_units == 1:
            kv = [(x[0], (0 if x[1][0] <= threshold else 1, x[1][0].item()))
                  for x in zip(data.get_keys(), predict)]
            pred_tbl = session.parallelize(kv, include_key=True)
            return data_inst.join(
                pred_tbl, lambda d, pred:
                [d.label, pred[0], pred[1], {
                    "label": pred[0]
                }])
        else:
            kv = [(x[0], (x[1].argmax(), [float(e) for e in x[1]]))
                  for x in zip(data.get_keys(), predict)]
            pred_tbl = session.parallelize(kv, include_key=True)
            return data_inst.join(
                pred_tbl, lambda d, pred: [
                    d.label, pred[0].item(), pred[1][pred[0]] / sum(pred[1]), {
                        "raw_predict": pred[1]
                    }
                ])
Example #11
    def setUp(self):
        session.init("test_encrypt_mode_calculator")

        self.list_data = []
        self.tuple_data = []
        self.numpy_data = []

        for i in range(30):
            list_value = [100 * i + j for j in range(20)]
            tuple_value = tuple(list_value)
            numpy_value = np.array(list_value, dtype="int")

            self.list_data.append(list_value)
            self.tuple_data.append(tuple_value)
            self.numpy_data.append(numpy_value)

        self.data_list = session.parallelize(self.list_data,
                                             include_key=False,
                                             partition=10)
        self.data_tuple = session.parallelize(self.tuple_data,
                                              include_key=False,
                                              partition=10)
        self.data_numpy = session.parallelize(self.numpy_data,
                                              include_key=False,
                                              partition=10)
Example #12
    def predict(self, data_inst):

        data = self.data_converter.convert(data_inst,
                                           batch_size=self.batch_size,
                                           encode_label=self.encode_label)
        predict = self.nn_model.predict(data)
        num_output_units = predict.shape[1]
        threshold = self.param.predict_param.threshold

        if num_output_units == 1:
            kv = [(x[0], (0 if x[1][0] <= threshold else 1, x[1][0].item()))
                  for x in zip(data.get_keys(), predict)]
            pred_tbl = session.parallelize(
                kv, include_key=True, partition=data_inst.get_partitions())
            return data_inst.join(
                pred_tbl, lambda d, pred:
                [d.label, pred[0], pred[1], {
                    "0": 1 - pred[1],
                    "1": pred[1]
                }])
        else:
            kv = [(x[0], (x[1].argmax(), [float(e) for e in x[1]]))
                  for x in zip(data.get_keys(), predict)]
            pred_tbl = session.parallelize(
                kv, include_key=True, partition=data_inst.get_partitions())
            return data_inst.join(
                pred_tbl, lambda d, pred: [
                    d.label, pred[0].item(), pred[1][pred[0]],
                    {str(v): pred[1][v]
                     for v in range(len(pred[1]))}
                ])
Example #13
    def setUp(self):
        session.init("test_huber_loss")
        self.delta = 1
        self.huber_loss = HuberLoss(self.delta)
        self.y_list = [i % 2 for i in range(100)]
        self.predict_list = [random.random() for i in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False)
        self.predict = session.parallelize(self.predict_list, include_key=False)
Example #14
    def setUp(self):
        session.init("test_fair_loss")
        self.rho = 0.5
        self.tweedie_loss = TweedieLoss(self.rho)
        self.y_list = [i % 2 for i in range(100)]
        self.predict_list = [random.random() for i in range(100)]
        self.y = session.parallelize(self.y_list, include_key=False)
        self.predict = session.parallelize(self.predict_list, include_key=False)
Example #15
    def save_eval_result(self, eval_data):
        session.parallelize(
            [eval_data],
            include_key=False,
            name=self.workflow_param.evaluation_output_table,
            namespace=self.workflow_param.evaluation_output_namespace,
            error_if_exist=False,
            persistent=True)
Example #16
    def setUp(self):
        session.init("test_label_checker")

        self.small_label_set = [Instance(label=i % 5) for i in range(100)]
        self.classify_inst = session.parallelize(self.small_label_set, include_key=False)
        self.regression_label = [Instance(label=random.random()) for i in range(100)]
        self.regression_inst = session.parallelize(self.regression_label)
        self.classify_checker = ClassifyLabelChecker()
        self.regression_checker = RegressionLabelChecker()
Example #17
    def save_eval_result(self, eval_data):
        LOGGER.info("@ save evaluation result to table with namespace: {0} and name: {1}".format(
            self.workflow_param.evaluation_output_namespace, self.workflow_param.evaluation_output_table))
        session.parallelize([eval_data],
                            include_key=False,
                            name=self.workflow_param.evaluation_output_table,
                            namespace=self.workflow_param.evaluation_output_namespace,
                            error_if_exist=False,
                            persistent=True)
Example #18
    def setUp(self):
        session.init("test_stratified_sampler")
        self.data = []
        self.data_to_trans = []
        for i in range(1000):
            self.data.append((i, Instance(label=i % 4, features=i * i)))
            self.data_to_trans.append((i, Instance(features=i**3)))

        self.table = session.parallelize(self.data, include_key=True)
        self.table_trans = session.parallelize(self.data_to_trans,
                                               include_key=True)
Example #19
    def predict(self, data_inst):
        """
        Prediction function. Note that the GMF model uses a different DataConverter in the evaluation and prediction procedures.
        :param data_inst: data instance
        :return: the prediction results
        """
        LOGGER.info(
            f"data_inst type: {type(data_inst)}, size: {data_inst.count()}, table name: {data_inst.get_name()}"
        )
        LOGGER.info(f"current flowid: {self.flowid}")
        if self.flowid == 'validate':
            # use GMFSequenceData in evaluation procedure (after training procedure)
            data = self.data_converter.convert(
                data_inst,
                batch_size=self.batch_size,
                neg_count=self.model_param.neg_count,
                training=True,
                flow_id=self.flowid)
            keys = data.get_keys()
            labels = data.get_validate_labels()
            label_data = fate_session.parallelize(
                zip(keys, labels),
                include_key=True,
                partition=data_inst._partitions)
        else:
            # use GMFSequencePredictData in prediction procedure
            data = self.data_converter.convert(data_inst,
                                               batch_size=self.batch_size,
                                               training=False)
            label_data = data_inst.map(lambda k, v:
                                       (k, v.features.astype(int).tolist()[2]))
        LOGGER.info(f"label_data example: {label_data.take(10)}")
        LOGGER.info(
            f"data example: {data_inst.first()[1].features.astype(int)}")
        LOGGER.info(f"converted data, size :{data.size}")
        predict = self._model.predict(data)
        LOGGER.info(f"predict shape: {predict.shape}")
        threshold = self.params.predict_param.threshold

        kv = [(x[0], (0 if x[1] <= threshold else 1, x[1].item()))
              for x in zip(data.get_keys(), predict)]
        pred_tbl = fate_session.parallelize(kv,
                                            include_key=True,
                                            partition=data_inst._partitions)
        pred_data = label_data.join(
            pred_tbl,
            lambda d, pred: [d, pred[0], pred[1], {
                "label": pred[0]
            }])
        LOGGER.info(f"pred_data sample: {pred_data.take(20)}")
        return pred_data
Example #20
    def gen_data(self, data_num, feature_num, partition):
        data = []
        header = [str(i) for i in range(feature_num)]
        # col_2 = np.random.rand(data_num)
        col_data = []
        for _ in range(feature_num - 1):
            while True:
                col_1 = np.random.rand(data_num)
                if np.mean(col_1) != 0:
                    break
            col_data.append(col_1)
        col_data.append(10 * np.ones(data_num))

        for key in range(data_num):
            data.append((key, np.array([col[key] for col in col_data])))

        result = session.parallelize(data,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        self.header = header

        self.coe_list = []
        for col in col_data:
            self.coe_list.append(np.std(col) / np.mean(col))
        return result
Example #21
    def setUp(self):
        self.data_num = 100
        self.feature_num = 3
        self.cols = [0, 1, 2]
        self.header = ['x' + str(i) for i in range(self.feature_num)]
        final_result = []

        for i in range(self.data_num):
            tmp = []
            for _ in range(self.feature_num):
                tmp.append(np.random.choice([1, 2, 3]))
            tmp = np.array(tmp)
            inst = Instance(inst_id=i, features=tmp, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)

        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=10)
        table.schema = {"header": self.header}
        self.model_name = 'OneHotEncoder'

        self.table = table

        self.args = {"data": {self.model_name: {"data": table}}}
Example #22
    def setUp(self):
        # eggroll.init("123")
        self.data_num = 1000
        self.feature_num = 200
        self.bin_num = 10
        final_result = []
        numpy_array = []
        for i in range(self.data_num):
            if 100 < i < 500:
                continue
            tmp = i * np.ones(self.feature_num)
            inst = Instance(inst_id=i, features=tmp, label=i % 2)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            numpy_array.append(tmp)
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=10)

        header = ['x' + str(i) for i in range(self.feature_num)]

        self.table = table
        self.table.schema = {'header': header}

        self.numpy_table = np.array(numpy_array)
        self.cols = [1, 2]
Example #23
def create_shared_gradient_table(gradients, index_list):
    indexed_instances = []
    for idx, grad in zip(index_list, gradients):
        indexed_instances.append((idx, grad))

    dtable = session.parallelize(indexed_instances, include_key=True)
    return dtable
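A hypothetical call to the helper above (the gradient values and indices are made up for illustration), pairing each gradient with the sample index it belongs to before shipping the result as a DTable:

gradients = [np.array([0.1, -0.2]), np.array([0.05, 0.3])]  # illustrative values
index_list = [10, 42]  # sample indices corresponding to the gradients
gradient_table = create_shared_gradient_table(gradients, index_list)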
Example #24
def save_model_parameters(model_parameters, model_table_name, model_namespace):
    dtable = parallelize(model_parameters.items(), include_key=True,
                         name=model_table_name,
                         namespace=model_namespace,
                         error_if_exist=True,
                         persistent=True)
    return dtable
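A hypothetical invocation of save_model_parameters, persisting a small dict of numpy weights; the table name and namespace below are made-up placeholders, not values used by the original code:

model_parameters = {"coef": np.zeros(10), "intercept": np.array([0.0])}  # illustrative weights
save_model_parameters(model_parameters,
                      model_table_name="hetero_lr_model_example",   # placeholder name
                      model_namespace="demo_namespace")             # placeholder namespace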
Example #25
    def predict(self, data_inst):
        keys, test_x, test_y = self._load_data(data_inst)
        self.set_partition(data_inst)

        preds = self.model.predict(test_x)

        predict_tb = session.parallelize(zip(keys, preds), include_key=True)
        if self.task_type == "regression":
            result = data_inst.join(predict_tb,
                                    lambda inst, predict: [inst.label, float(predict[0]), float(predict[0]),
                                                           {"label": float(predict[0])}])
        else:
            if self.num_label > 2:
                result = data_inst.join(predict_tb,
                                        lambda inst, predict: [inst.label,
                                                               int(np.argmax(predict)),
                                                               float(np.max(predict)),
                                                               dict([(str(idx), float(predict[idx])) for idx in
                                                                     range(predict.shape[0])])])

            else:
                threshold = self.predict_param.threshold
                result = data_inst.join(predict_tb,
                                        lambda inst, predict: [inst.label,
                                                               1 if predict[0] > threshold else 0,
                                                               float(predict[0]),
                                                               {"0": 1 - float(predict[0]),
                                                                "1": float(predict[0])}])

        return result
Example #26
    def test_sparse_abnormal_data(self):
        final_result = []
        numpy_array = []
        sparse_inst_shape = self.feature_num + 15
        indices = [x for x in range(self.feature_num + 10)]
        for i in range(self.data_num):
            tmp = 100 * np.random.rand(self.feature_num)
            tmp = [ik for ik in range(self.feature_num)]
            tmp[i % self.feature_num] = 'nan'
            # data_index = np.random.choice(indices, self.feature_num, replace=False)
            # data_index = sorted(data_index)
            data_index = [idx for idx in range(self.feature_num)]
            sparse_inst = SparseVector(data_index,
                                       tmp,
                                       shape=sparse_inst_shape)
            if i == 0:
                aa = sparse_inst.get_data(0, 'a')
                print('in for loop: {}, type: {}'.format(aa, type(aa)))
            inst = Instance(inst_id=i, features=sparse_inst, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            n = 0
            pointer = 0
            tmp_array = []
            while n < sparse_inst_shape:
                if n in data_index:
                    tmp_array.append(tmp[pointer])
                    pointer += 1
                else:
                    tmp_array.append(0)
                n += 1
            numpy_array.append(tmp_array)

        abnormal_value = final_result[0][1].features.get_data(0, 'a')
        print('abnormal_value: {}, type: {}'.format(abnormal_value,
                                                    type(abnormal_value)))
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=1)
        header = ['x' + str(i) for i in range(sparse_inst_shape)]
        numpy_table = np.array(numpy_array)
        table.schema = {'header': header}
        self.used_data_set.append(table)

        bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
        split_points = bin_obj.fit_split_points(table)
        print('split_points: {}'.format(split_points))
        print(numpy_table)

        trans_result = bin_obj.transform(table,
                                         transform_cols_idx=-1,
                                         transform_type='bin_num')
        trans_result = trans_result.collect()
        print('transform result: ')
        for k, v in trans_result:
            value = v.features.get_all_data()
            value_list = []
            for value_k, value_v in value:
                value_list.append((value_k, value_v))
            print(k, value_list)
Example #27
    def setUp(self):
        self.data = []
        self.max_feature = -1
        for i in range(100):
            row = []
            label = i % 2
            row.append(str(label))
            dict = {}

            for j in range(20):
                x = random.randint(0, 1000)
                val = random.random()
                if x in dict:
                    continue
                self.max_feature = max(self.max_feature, x)
                dict[x] = True
                row.append(":".join(map(str, [x, val])))

            self.data.append((i, " ".join(row)))

        self.table = session.parallelize(self.data,
                                         include_key=True,
                                         partition=16)
        self.args = {"data": {"data_io_0": {"data": self.table}}}

        self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
Example #28
    def setUp(self):
        # eggroll.init("123")
        self.data_num = 10
        self.feature_num = 5
        final_result = []
        numpy_array = []
        for i in range(self.data_num):
            tmp = np.random.rand(self.feature_num)
            inst = Instance(inst_id=i, features=tmp, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            numpy_array.append(tmp)
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=10)

        header = ['x' + str(i) for i in range(self.feature_num)]
        self.col_dict = {}
        for idx, h in enumerate(header):
            self.col_dict[h] = idx

        self.table = table
        self.table.schema = {'header': header}
        self.numpy_table = np.array(numpy_array)
        self.cols = [1, 2]
        self.used_data_set = []
Example #29
    def transform(self, instance_table):
        """
        Transform instances into features.

        :param instance_table: dtable with a collection of (index, instance) pairs
        :return: a (dtable, index_tracking_list) pair holding the transformed instances
        """
        start = time.time()
        LOGGER.debug("@ extract representative features from raw input")

        index_tracking_list = []

        indexed_instances = instance_table.collect()
        features_list = []
        instances_list = []
        for idx, inst in indexed_instances:
            index_tracking_list.append(idx)
            features_list.append(inst.features)
            instances_list.append(inst)
        raw_features = np.array(features_list)
        trans_features = self.model.transform(raw_features)

        indexed_instances = []
        for idx, inst, feat in zip(index_tracking_list, instances_list, trans_features):
            inst.set_feature(feat)
            indexed_instances.append((idx, inst))

        dtable = session.parallelize(indexed_instances, include_key=True, partition=instance_table._partitions)

        end = time.time()
        LOGGER.debug("@ transform time:" + str(end - start))
        return dtable, index_tracking_list
    def federated_find_split(self, dep=-1, batch=-1):
        LOGGER.info("federated find split of depth {}, batch {}".format(dep, batch))
        encrypted_splitinfo_host = self.sync_encrypted_splitinfo_host(dep, batch)

        for i in range(len(encrypted_splitinfo_host)):
            init_gain = self.min_impurity_split - consts.FLOAT_ZERO
            encrypted_init_gain = self.encrypter.encrypt(init_gain)
            best_splitinfo_host = [[-1, encrypted_init_gain] for j in range(len(self.cur_split_nodes))]
            best_gains = [init_gain for j in range(len(self.cur_split_nodes))]
            max_nodes = max(len(encrypted_splitinfo_host[i][j]) for j in range(len(self.cur_split_nodes)))
            for k in range(0, max_nodes, consts.MAX_FEDERATED_NODES):
                batch_splitinfo_host = [encrypted_splitinfo[k: k + consts.MAX_FEDERATED_NODES] for encrypted_splitinfo
                                        in encrypted_splitinfo_host[i]]
                encrypted_splitinfo_host_table = session.parallelize(zip(self.cur_split_nodes, batch_splitinfo_host),
                                                                     include_key=False,
                                                                     partition=self.data_bin._partitions)
                splitinfos = encrypted_splitinfo_host_table.mapValues(self.find_host_split).collect()
                for _, splitinfo in splitinfos:
                    if best_splitinfo_host[_][0] == -1:
                        best_splitinfo_host[_] = list(splitinfo[:2])
                        best_gains[_] = splitinfo[2]
                    elif splitinfo[0] != -1 and splitinfo[2] > best_gains[_]:
                        best_splitinfo_host[_][0] = k + splitinfo[0]
                        best_splitinfo_host[_][1] = splitinfo[1]
                        best_gains[_] = splitinfo[2]

            self.sync_federated_best_splitinfo_host(best_splitinfo_host, dep, batch, i)