def setUp(self): eggroll.init("test_instance") dense_inst = [] for i in range(100): inst = Instance(features=(i % 16 * np.ones(20))) dense_inst.append((i, inst)) self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=2) sparse_inst = [] col_zero = [] for i in range(100): indices = [] data = [] for j in range(20): val = ((i + 5)**3 + (j + 1)**4) % 16 if val > 0: indices.append(j) data.append(val) if j == 0: col_zero.append(val) sparse_vec = SparseVector(indices, data, 20) inst = Instance(features=sparse_vec) sparse_inst.append((i, inst)) self.sparse_inst = sparse_inst self.sparse_table = eggroll.parallelize(sparse_inst, include_key=True, partition=1)
def setUp(self): eggroll.init("test_instance") dense_inst = [] headers = ['x' + str(i) for i in range(20)] for i in range(100): inst = Instance(features=(i % 16 * np.ones(20))) dense_inst.append((i, inst)) self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=2) self.dense_table.schema = {'header': headers} self.sparse_inst = [] for i in range(100): dict = {} indices = [] data = [] for j in range(20): idx = random.randint(0, 29) if idx in dict: continue dict[idx] = 1 val = random.random() indices.append(idx) data.append(val) sparse_vec = SparseVector(indices, data, 30) self.sparse_inst.append((i, Instance(features=sparse_vec))) self.sparse_table = eggroll.parallelize(self.sparse_inst, include_key=True) self.sparse_table.schema = { "header": ["fid" + str(i) for i in range(30)] }
def _compensate_set_difference(self, original_data, data_output):
    self.coverage = data_output.count() / original_data.count()
    import copy
    schema = copy.deepcopy(original_data.schema)
    if self.need_label:
        original_data = original_data.mapValues(
            lambda v: Instance(label="unretrieved", features=[], inst_id=v.inst_id))
    else:
        feature_count = len(self.target_cols)
        features = np.array(["unretrieved"] * feature_count)
        original_data = original_data.mapValues(
            lambda v: Instance(features=features, inst_id=v.inst_id))
    # LOGGER.debug(f"original data features is {list(original_data.collect())[0][1].features}")
    # LOGGER.debug(f"original data label is {list(original_data.collect())[0][1].label}")
    data_output = original_data.union(data_output, lambda v, u: u)
    # LOGGER.debug(f"data_output features after union is {list(data_output.collect())[0][1].features}")
    # LOGGER.debug(f"data_output label after union is {list(data_output.collect())[0][1].label}")
    if self.need_label:
        schema["label_name"] = "retrieved_value"
        schema["header"] = []
        data_output.schema = schema
    else:
        schema["label_name"] = None
        schema["header"] = self.target_cols
        data_output.schema = schema
    self._sync_coverage(original_data)
    return data_output
def gen_data(self, data_num, feature_num, partition, is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    header = [str(i) for i in range(feature_num)]
    # bin_num is expected to be defined at module level in the original test file
    for data_key in range(data_num):
        value = data_key % bin_num
        if value == 0:
            if shift_iter % bin_num == 0:
                value = bin_num - 1
            shift_iter += 1
        if not is_sparse:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            inst = Instance(inst_id=data_key, features=features, label=data_key % 2)
        else:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            data_index = [x for x in range(feature_num)]
            sparse_inst = SparseVector(data_index, data=features, shape=10 * feature_num)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
            header = [str(i) for i in range(feature_num * 10)]
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    return result
def split_into_guest_host_dtable(X, y, overlap_ratio=0.2, guest_split_ratio=0.5, guest_feature_num=16,
                                 tables_name=None, partition=1):
    data_size = X.shape[0]
    overlap_size = int(data_size * overlap_ratio)
    overlap_indexes = np.array(range(overlap_size))
    guest_size = int((data_size - overlap_size) * guest_split_ratio)

    guest_table_ns = "guest_table_ns"
    guest_table_name = "guest_table_name"
    host_table_ns = "host_table_ns"
    host_table_name = "host_table_name"
    if tables_name is not None:
        guest_table_ns = tables_name["guest_table_ns"]
        guest_table_name = tables_name["guest_table_name"]
        host_table_ns = tables_name["host_table_ns"]
        host_table_name = tables_name["host_table_name"]

    guest_temp = []
    for i in range(0, overlap_size + guest_size):
        guest_temp.append(
            (i, Instance(inst_id=None, weight=1.0,
                         features=X[i, :guest_feature_num].reshape(1, -1), label=y[i, 0])))
    guest_data = table(name=guest_table_name, namespace=guest_table_ns, partition=partition)
    guest_data.put_all(guest_temp)

    host_temp = []
    for i in range(0, overlap_size):
        host_temp.append(
            (i, Instance(inst_id=None, weight=1.0,
                         features=X[i, guest_feature_num:].reshape(1, -1), label=y[i, 0])))
    for i in range(overlap_size + guest_size, len(X)):
        host_temp.append(
            (i, Instance(inst_id=None, weight=1.0,
                         features=X[i, guest_feature_num:].reshape(1, -1), label=y[i, 0])))
    host_data = table(name=host_table_name, namespace=host_table_ns, partition=partition)
    host_data.put_all(host_temp)

    return guest_data, host_data, overlap_indexes
def setUp(self):
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    self.gradient_operator = LogisticGradient()
    self.taylor_operator = TaylorLogisticGradient()

    self.X = np.array([[1, 2, 3, 4, 5],
                       [3, 2, 4, 5, 1],
                       [2, 2, 3, 1, 1]]) / 10
    self.X1 = np.c_[self.X, np.ones(3)]
    self.Y = np.array([[1], [1], [-1]])

    self.values = []
    for idx, x in enumerate(self.X):
        inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
        self.values.append((idx, inst))

    self.values1 = []
    for idx, x in enumerate(self.X1):
        inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
        self.values1.append((idx, inst))

    self.coef = np.array([2, 2.3, 3, 4, 2.1]) / 10
    self.coef1 = np.append(self.coef, [1])
def setUp(self): session.init("test_label_checker") self.small_label_set = [Instance(label=i % 5) for i in range(100)] self.classify_inst = session.parallelize(self.small_label_set, include_key=False) self.regression_label = [Instance(label=random.random()) for i in range(100)] self.regression_inst = session.parallelize(self.regression_label) self.classify_checker = ClassifyLabelChecker() self.regression_checker = RegressionLabelChecker()
def _gen_data(self, data_num, feature_num, partition, expect_ratio, is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    header = [str(i) for i in range(feature_num)]
    # bin_num = 3
    label_count = {}
    # expect_ratio = {
    #     0: (1, 9),
    #     1: (1, 1),
    #     2: (9, 1)
    # }
    bin_num = len(expect_ratio)
    for data_key in range(data_num):
        value = data_key % bin_num
        if value == 0:
            if shift_iter % bin_num == 0:
                value = bin_num - 1
            shift_iter += 1
        if not is_sparse:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            label = self.__gen_label(value, label_count, expect_ratio)
            inst = Instance(inst_id=data_key, features=features, label=label)
        else:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            data_index = [x for x in range(feature_num)]
            sparse_inst = SparseVector(data_index, data=features, shape=10 * feature_num)
            label = self.__gen_label(value, label_count, expect_ratio)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=label)
            header = [str(i) for i in range(feature_num * 10)]
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.table_list.append(result)
    return result
def setUp(self): eggroll.init("test_stratified_sampler") self.data = [] self.data_to_trans = [] for i in range(1000): self.data.append((i, Instance(label=i % 4, features=i * i))) self.data_to_trans.append((i, Instance(features = i ** 3))) self.table = eggroll.parallelize(self.data, include_key=True) self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
def _merge_instance(id_map1, id_map2, need_label):
    """
    :param id_map1: (a, b)
    :param id_map2: (a, c)
    :return: (c, b)
    """
    merge_table = id_map1.join(id_map2, lambda v, u: (u, v))
    if need_label:
        return merge_table.map(lambda k, v: (v[0], Instance(label=v[1], features=[])))
    else:
        return merge_table.map(lambda k, v: (v[0], Instance(features=v[1])))
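# Worked example for _merge_instance (hypothetical keys/values, shown only to
# illustrate the (a, b) x (a, c) -> (c, b) re-keying described in the docstring):
#   id_map1 = {"a1": "b1"}   # (a, b)
#   id_map2 = {"a1": "c1"}   # (a, c)
#   join produces {"a1": ("c1", "b1")}; the map step then re-keys by "c1",
#   yielding {"c1": Instance(label="b1", features=[])} when need_label is True,
#   or {"c1": Instance(features="b1")} otherwise.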
def _gen_data(self, data_num, feature_num, partition, expect_split_points, is_sparse=False, use_random=False):
    data = []
    shift_iter = 0
    header = [str(i) for i in range(feature_num)]
    bin_num = len(expect_split_points)
    for data_key in range(data_num):
        value = expect_split_points[data_key % bin_num]
        if value == expect_split_points[-1]:
            if shift_iter % bin_num == 0:
                value = expect_split_points[0]
            shift_iter += 1
        if not is_sparse:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            inst = Instance(inst_id=data_key, features=features, label=data_key % 2)
        else:
            if not use_random:
                features = value * np.ones(feature_num)
            else:
                features = np.random.random(feature_num)
            data_index = [x for x in range(feature_num)]
            sparse_inst = SparseVector(data_index, data=features, shape=feature_num)
            inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
            header = [str(i) for i in range(feature_num)]
        data.append((data_key, inst))
    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.table_list.append(result)
    return result
def gen_data(self, data_num, feature_num, partition):
    data = []
    header = [str(i) for i in range(feature_num)]
    # col_2 = np.random.rand(data_num)
    col_data = []
    for _ in range(feature_num - 1):
        while True:
            col_1 = np.random.rand(data_num)
            if np.mean(col_1) != 0:
                break
        col_data.append(col_1)
    col_data.append(10 * np.ones(data_num))

    for key in range(data_num):
        data.append(
            (key, Instance(features=np.array([col[key] for col in col_data]))))

    result = session.parallelize(data, include_key=True, partition=partition)
    result.schema = {'header': header}
    self.header = header

    self.coe_list = []
    for col in col_data:
        self.coe_list.append(np.std(col) / np.mean(col))
    return result
def _gen_table_data(self):
    if self._dense_table is not None:
        return self._dense_table, self._dense_not_inst_table, self._original_data

    headers = ['x' + str(i) for i in range(self.feature_num)]
    dense_inst = []
    dense_not_inst = []
    original_data = 100 * np.random.random((self.count, self.feature_num))
    # original_data = 100 * np.zeros((self.count, self.feature_num))
    for i in range(self.count):
        features = original_data[i, :]
        inst = Instance(features=features)
        dense_inst.append((i, inst))
        dense_not_inst.append((i, features))

    dense_table = session.parallelize(dense_inst, include_key=True, partition=16)
    dense_not_inst_table = session.parallelize(dense_not_inst, include_key=True, partition=16)
    dense_table.schema = {'header': headers}
    dense_not_inst_table.schema = {'header': headers}
    self._dense_table, self._dense_not_inst_table, self._original_data = \
        dense_table, dense_not_inst_table, original_data
    return dense_table, dense_not_inst_table, original_data
def fit(self, data_inst):
    """
    Start the actual data processing.
    """
    self._abnormal_detection(data_inst)
    data_instances = data_inst.mapValues(self.load_data)

    LOGGER.info("Start to normalize data")
    LOGGER.info("Start to compute sum_square_x_host")
    sum_square_x_host = data_instances.mapValues(
        lambda x: np.sum(np.power(x.features, 2)))

    LOGGER.info("Send sum_square_x_host to guest")
    self.transfer_variable.host_to_guest.remote(obj=sum_square_x_host,
                                                role=consts.GUEST,
                                                idx=-1,
                                                suffix=(0, 0))

    LOGGER.info("Receive norm_x from guest")
    self.norm_x = self.transfer_variable.guest_to_host.get(idx=-1, suffix=(1, 1))

    LOGGER.info("Start to normalize data")
    self.data_output = data_inst.join(
        self.norm_x[0],
        lambda x, y: Instance(features=np.true_divide(x.features, y), label=x.label))

    return self.data_output
def fit(self, data_inst):
    """
    Start the actual data processing.
    """
    self._abnormal_detection(data_inst)
    data_instances = data_inst.mapValues(self.load_data)

    LOGGER.info("Start to normalize data")
    LOGGER.info("Start to compute sum_square_x_guest")
    sum_square_x_guest = data_instances.mapValues(
        lambda x: np.sum(np.power(x.features, 2)))

    LOGGER.info("Receive sum_square_x_host from host")
    sum_square_x_host = self.transfer_variable.host_to_guest.get(
        idx=-1, suffix=(0, 0))

    LOGGER.info("Compute the square root norm_x")
    self.norm_x = sum_square_x_guest.join(sum_square_x_host[0],
                                          lambda g, h: (g + h) ** 0.5)

    LOGGER.info("Send norm_x to the other party")
    self.transfer_variable.guest_to_host.remote(self.norm_x,
                                                role=consts.HOST,
                                                idx=-1,
                                                suffix=(1, 1))

    LOGGER.info("Normalize the data")
    self.data_output = data_inst.join(
        self.norm_x,
        lambda x, y: Instance(features=np.true_divide(x.features, y), label=x.label))

    return self.data_output
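# In the two-party scheme implemented by the two fit() methods above, each
# sample's shared norm follows directly from the code:
#   norm_x = sqrt(sum_j x_guest_j ** 2 + sum_j x_host_j ** 2)
# The guest computes norm_x from its own squared sum plus the host's squared
# sum, sends it back, and both parties divide their local feature vector for
# that sample by the same norm_x.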
def setUp(self): # eggroll.init("123") self.data_num = 1000 self.feature_num = 200 self.bin_num = 10 final_result = [] numpy_array = [] for i in range(self.data_num): if 100 < i < 500: continue tmp = i * np.ones(self.feature_num) inst = Instance(inst_id=i, features=tmp, label=i % 2) tmp_pair = (str(i), inst) final_result.append(tmp_pair) numpy_array.append(tmp) table = session.parallelize(final_result, include_key=True, partition=10) header = ['x' + str(i) for i in range(self.feature_num)] self.table = table self.table.schema = {'header': header} self.numpy_table = np.array(numpy_array) self.cols = [1, 2]
def initialize(y):
    y_inst = y.mapValues(lambda label: Instance(features=np.asarray([label])))
    y_inst.schema = {"header": ["label"]}
    statistics = MultivariateStatisticalSummary(y_inst, -1)
    mean = statistics.get_mean()["label"]
    return y.mapValues(lambda x: np.asarray([mean])), np.asarray([mean])
def feed_into_dtable(ids, X, y, sample_range, feature_range, tables_name=None, partition=1):
    """
    Create an eggroll table fed with the data specified by the provided parameters.

    Parameters
    ----------
    :param ids: 1D numpy array
    :param X: 2D numpy array
    :param y: 2D numpy array
    :param sample_range: a tuple specifying the range of samples to feed into the dtable
    :param feature_range: a tuple specifying the range of features to feed into the dtable
    :param tables_name: a dictionary specifying the table namespace (key "table_ns") and table name (key "table_name")
    :param partition: number of partitions used when creating the dtable
    :return: an eggroll dtable
    """
    table_ns = "default_table_namespace"
    table_name = get_timestamp()
    if tables_name is not None:
        table_ns = tables_name["table_ns"]
        table_name = tables_name["table_name"]

    sample_list = []
    for i in range(sample_range[0], sample_range[1]):
        sample_list.append((ids[i], Instance(inst_id=ids[i],
                                             features=X[i, feature_range[0]:feature_range[1]],
                                             label=y[i, 0])))

    data_table = table(name=table_name, namespace=table_ns, partition=partition)
    data_table.put_all(sample_list)
    return data_table
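# A minimal usage sketch for feed_into_dtable. The array sizes and the
# "credit_ns"/"credit_train" namespace and table name below are hypothetical,
# chosen only to illustrate the parameters documented above:
#
#     ids = np.arange(100)
#     X = np.random.random((100, 30))
#     y = np.random.randint(0, 2, (100, 1))
#     dtable = feed_into_dtable(
#         ids, X, y,
#         sample_range=(0, 50),       # rows 0..49
#         feature_range=(0, 10),      # columns 0..9
#         tables_name={"table_ns": "credit_ns", "table_name": "credit_train"},
#         partition=1)
#     # dtable now holds 50 Instance objects keyed by their ids.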
def fit(self, data_inst): LOGGER.info("begin to make guest data") self._init_data(data_inst) LOGGER.info("split data into multiple random parts") self.secure() LOGGER.info("share one random part data to multiple hosts") self.sync_share_to_host() LOGGER.info("get share of one random part data from multiple hosts") self.recv_share_from_host() LOGGER.info("begin to get sum of multiple party") self.sub_key_sum() LOGGER.info("receive host sum from host") self.recv_host_sum_from_host() self.reconstruct() LOGGER.info("success to calculate privacy sum") self.secret_sum = self.secret_sum.join(data_inst, lambda s, v: Instance(features=numpy.array(s), inst_id=v.inst_id)) self.secret_sum.schema = self.output_schema data_output = self.secret_sum return data_output
def setUp(self): # eggroll.init("123") self.data_num = 1000 self.feature_num = 20 final_result = [] numpy_array = [] for i in range(self.data_num): tmp = np.random.rand(self.feature_num) inst = Instance(inst_id=i, features=tmp, label=0) tmp_pair = (str(i), inst) final_result.append(tmp_pair) numpy_array.append(tmp) table = eggroll.parallelize(final_result, include_key=True, partition=10) header = ['x' + str(i) for i in range(self.feature_num)] self.col_dict = {} for idx, h in enumerate(header): self.col_dict[h] = idx self.table = table self.table.schema = {'header': header} self.numpy_table = np.array(numpy_array) self.cols = [1, 2] self.used_data_set = []
def trans_sparse(instance):
    dense_features = instance.features
    indices = [i for i in range(len(dense_features))]
    sparse_features = SparseVector(indices=indices,
                                   data=dense_features,
                                   shape=len(dense_features))
    return Instance(inst_id=None,
                    features=sparse_features,
                    label=instance.label)
def test_sparse_abnormal_data(self):
    final_result = []
    numpy_array = []
    sparse_inst_shape = self.feature_num + 15
    indices = [x for x in range(self.feature_num + 10)]
    for i in range(self.data_num):
        tmp = 100 * np.random.rand(self.feature_num)
        tmp = [ik for ik in range(self.feature_num)]
        tmp[i % self.feature_num] = 'nan'
        # data_index = np.random.choice(indices, self.feature_num, replace=False)
        # data_index = sorted(data_index)
        data_index = [idx for idx in range(self.feature_num)]
        sparse_inst = SparseVector(data_index, tmp, shape=sparse_inst_shape)
        if i == 0:
            aa = sparse_inst.get_data(0, 'a')
            print('in for loop: {}, type: {}'.format(aa, type(aa)))
        inst = Instance(inst_id=i, features=sparse_inst, label=0)
        tmp_pair = (str(i), inst)
        final_result.append(tmp_pair)

        n = 0
        pointer = 0
        tmp_array = []
        while n < sparse_inst_shape:
            if n in data_index:
                tmp_array.append(tmp[pointer])
                pointer += 1
            else:
                tmp_array.append(0)
            n += 1
        numpy_array.append(tmp_array)

    abnormal_value = final_result[0][1].features.get_data(0, 'a')
    print('abnormal_value: {}, type: {}'.format(abnormal_value, type(abnormal_value)))
    table = session.parallelize(final_result, include_key=True, partition=1)
    header = ['x' + str(i) for i in range(sparse_inst_shape)]
    numpy_table = np.array(numpy_array)
    table.schema = {'header': header}
    self.used_data_set.append(table)

    bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
    split_points = bin_obj.fit_split_points(table)
    print('split_points: {}'.format(split_points))
    print(numpy_table)
    trans_result = bin_obj.transform(table, transform_cols_idx=-1, transform_type='bin_num')
    trans_result = trans_result.collect()
    print('transform result: ')
    for k, v in trans_result:
        value = v.features.get_all_data()
        value_list = []
        for value_k, value_v in value:
            value_list.append((value_k, value_v))
        print(k, value_list)
def to_instance(self, features, label=None):
    if self.header is None and len(features) != 0:
        raise ValueError(
            "features shape {} not equal to header shape 0".format(len(features)))
    elif self.header is not None and len(self.header) != len(features):
        raise ValueError(
            "features shape {} not equal to header shape {}".format(
                len(features), len(self.header)))

    if self.label_idx is not None:
        if self.label_type == 'int':
            label = int(label)
        elif self.label_type in ["float", "float64"]:
            label = float(label)
        format_features = DenseFeatureTransformer.gen_output_format(
            features, self.data_type, self.exclusive_data_type_fid_map,
            self.output_format, missing_impute=self.missing_impute)
    else:
        format_features = DenseFeatureTransformer.gen_output_format(
            features, self.data_type, self.exclusive_data_type_fid_map,
            self.output_format, missing_impute=self.missing_impute)

    return Instance(inst_id=None, features=format_features, label=label)
def setUp(self):
    self.feature_histogram = FeatureHistogram()
    eggroll.init("test_feature_histogram")
    data_insts = []
    for i in range(1000):
        indices = []
        data = []
        for j in range(10):
            x = random.randint(0, 5)
            if x != 0:
                data.append(x)
                indices.append(j)
        sparse_vec = SparseVector(indices, data, shape=10)
        data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
    self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
    self.data_insts = data_insts
    self.data_bin = eggroll.parallelize(data_insts, include_key=False)

    self.grad_and_hess_list = [(random.random(), random.random()) for i in range(1000)]
    self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

    bin_split_points = []
    for i in range(10):
        bin_split_points.append(np.array([i for i in range(5)]))
    self.bin_split_points = np.array(bin_split_points)
    self.bin_sparse = [0 for i in range(10)]
def setUp(self):
    self.data_num = 1000
    self.feature_num = 3
    self.cols = [0, 1, 2, 3]
    self.header = ['x' + str(i) for i in range(self.feature_num)]
    final_result = []
    for i in range(self.data_num):
        tmp = []
        for _ in range(self.feature_num):
            tmp.append(np.random.choice([1, 2, 3, 'test_str']))
        tmp = np.array(tmp)
        inst = Instance(inst_id=i, features=tmp, label=0)
        tmp_pair = (str(i), inst)
        final_result.append(tmp_pair)
    table = session.parallelize(final_result, include_key=True, partition=10)
    table.schema = {"header": self.header}
    self.model_name = 'OneHotEncoder'
    self.table = table

    self.args = {"data": {self.model_name: {"data": table}}}
def setUp(self): eggroll.init("test_instance") dense_inst = [] dense_not_inst = [] headers = ['x' + str(i) for i in range(20)] self.header = headers self.eps = 1e-5 self.count = 100 self.dense_data_transpose = [] for i in range(self.count): features = i % 16 * np.ones(20) inst = Instance(features=features) dense_inst.append((i, inst)) self.dense_data_transpose.append(features) dense_not_inst.append((i, features)) self.dense_inst = dense_inst self.dense_not_inst = dense_not_inst self.dense_data_transpose = np.array(self.dense_data_transpose) self.dense_data_transpose = self.dense_data_transpose.transpose() self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=5) self.dense_not_inst_table = eggroll.parallelize(dense_not_inst, include_key=True, partition=5) self.dense_table.schema = {'header': headers} self.dense_not_inst_table.schema = {'header': headers} col_index = [1, 2, 3] self.col_index = col_index self.summary_obj = MultivariateStatisticalSummary(self.dense_table, col_index, abnormal_list=[None]) self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table, col_index, abnormal_list=[None])
def setUp(self):
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)

    size = 10
    self.wx = eggroll.parallelize(
        [self.paillier_encrypt.encrypt(i) for i in range(size)])
    self.en_sum_wx_square = eggroll.parallelize(
        [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
    self.w = [i for i in range(size)]
    self.data_inst = eggroll.parallelize(
        [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2)) for i in range(size)],
        partition=1)

    # test fore_gradient
    self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]

    # test gradient
    self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125,
                     1.125, 1.125, 1.125, 1.125, 1.125]
    self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
                                   1.125, 1.125, 1.125, 1.125, 1.125]

    self.loss = 4.505647
def setUp(self):
    self.paillier_encrypt = PaillierEncrypt()
    self.paillier_encrypt.generate_key()
    # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
    self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

    size = 10
    self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)],
                                     partition=48, include_key=False)
    # self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)])
    self.en_sum_wx_square = session.parallelize(
        [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)],
        partition=48, include_key=False)
    self.wx = np.array([i for i in range(size)])
    self.w = self.wx / np.array([1 for _ in range(size)])
    self.data_inst = session.parallelize(
        [Instance(features=np.array([1 for _ in range(size)]), label=pow(-1, i % 2)) for i in range(size)],
        partition=48, include_key=False)

    # test fore_gradient
    self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
    # test gradient
    self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125,
                     1.125, 1.125, 1.125, 1.125, 1.125]
    self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
                                   1.125, 1.125, 1.125, 1.125, 1.125]
    self.loss = 4.505647
def test_replace_predict_label(self):
    true_label, predict_label, predict_score, predict_detail, predict_type = \
        1, 0, 0.1, {"1": 0.1, "0": 0.9}, "train"
    predict_result = Instance(inst_id=0,
                              features=[true_label, predict_label, predict_score,
                                        predict_detail, predict_type])
    r_predict_instance = self.label_transformer_obj.replace_predict_label(predict_result,
                                                                          self.predict_label_encoder)
    r_predict_result = r_predict_instance.features
    c_predict_result = ["yes", "no", predict_score, {"yes": 0.1, "no": 0.9}, predict_type]
    self.assertEqual(r_predict_result, c_predict_result)
def test_instance(self):
    inst = Instance(inst_id=5, weight=2.0, features=[1, 2, 3], label=-5)
    self.assertTrue(inst.inst_id == 5
                    and abs(inst.weight - 2.0) < 1e-8
                    and inst.features == [1, 2, 3]
                    and inst.label == -5)

    inst.set_weight(3)
    inst.set_label(5)
    inst.set_feature(["yes", "no"])
    self.assertTrue(inst.weight == 3
                    and inst.label == 5
                    and inst.features == ["yes", "no"])