def test_distributed_calculate_sum_XY(self):
    """Compare compute_sum_XY with a NumPy reference on a 3x3 example.

    The reference is the column-wise sum (axis 0) of the element-wise
    product ``X * Y``, where the (3, 1) label column broadcasts across
    the three feature columns.
    """
    print("--- test_distributed_calculate_sum_XY ---")

    features = np.array([[1., 2., 3.],
                         [4., 5., 6.],
                         [7., 8., 9.]])
    labels = np.array([[1], [-1], [1]])

    # Reference value computed locally with plain NumPy.
    reference = (features * labels).sum(axis=0)

    result = compute_sum_XY(features, labels)
    assert_array(reference, result)
def test_distributed_calculate_avg_XY_1(self):
    """Compare compute_avg_XY with a NumPy reference on a 3x3 example.

    The reference is the column-wise mean (axis 0) of the element-wise
    product ``X * Y``, with Y given as a (3, 1) column that broadcasts
    across the feature columns.
    """
    print("--- test_distributed_calculate_avg_XY_1 ---")

    features = np.array([[1., 2., 3.],
                         [4., 5., 6.],
                         [7., 8., 9.]])
    labels = np.array([[1], [-1], [1]])

    # Reference value computed locally with plain NumPy.
    reference = (features * labels).mean(axis=0)

    result = compute_avg_XY(features, labels)
    assert_array(reference, result)
def test_create_table_with_dict(self):
    """Check that create_table stores each row under its supplied index.

    A random (10, 10) matrix is loaded with ten explicit odd-numbered
    keys; the collected table must contain one entry per key, and each
    entry must equal the row that was passed in at the same position.
    """
    row_count = 10
    expect_data = np.random.rand(row_count, 10)
    indexes = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

    dtable = create_table(expect_data, indexes)

    # Collected items are indexed as (key, value).
    actual_data = {entry[0]: entry[1] for entry in dtable.collect()}

    assert dtable.count() == len(indexes)
    for key, expected_row in zip(indexes, expect_data):
        assert_array(actual_data[key], expected_row)
def test_distributed_calculate_avg_XY_2(self):
    """Compare compute_avg_XY with NumPy when Y has the same shape as X.

    Y is tiled out to X's column count, so both operands are (3, 3).
    compute_avg_XY is called with the arguments in both orders and each
    result is compared with the column-wise sum of the element-wise
    product divided by the number of rows.
    """
    print("--- test_distributed_calculate_avg_XY_2 ---")

    features = np.array([[1, 2, 3],
                         [4, 5, 6],
                         [7, 8, 9]], dtype=np.float64)
    label_column = np.array([[1], [-1], [1]])
    # Repeat the label column so it has one column per feature.
    labels = np.tile(label_column, (1, features.shape[-1]))

    row_count = len(labels)
    reference_yx = np.sum(labels * features, axis=0) / row_count
    reference_xy = np.sum(features * labels, axis=0) / row_count

    result_xy = compute_avg_XY(features, labels)
    result_yx = compute_avg_XY(labels, features)

    assert_array(reference_yx, result_xy)
    assert_array(reference_xy, result_yx)
def test_feed_into_dtable(self):
    """Check that feed_into_dtable keeps only the requested rows/columns.

    Fifty random samples with six features are fed in together with a
    sample range of (10, 30) and a feature range of (2, 5). The
    collected table must hold exactly the 20 selected ids, and each
    stored instance must expose the sliced feature row as ``features``
    and the matching label value as ``label``.
    """
    ids = list(range(50))
    X = np.random.rand(50, 6)
    y = np.random.rand(50, 1)
    sample_range = (10, 30)
    feature_range = (2, 5)

    row_start, row_stop = sample_range
    col_start, col_stop = feature_range

    expected_sample_number = row_stop - row_start
    expected_ids = ids[row_start:row_stop]
    expected_X = X[row_start:row_stop, col_start:col_stop]
    expected_y = y[row_start:row_stop]

    # Expected features/label per id, aligned by position in the slice.
    expected_data = {
        sample_id: {"X": feature_row, "y": label_row}
        for sample_id, feature_row, label_row
        in zip(expected_ids, expected_X, expected_y)
    }

    data_table = feed_into_dtable(ids, X, y, sample_range, feature_range)
    data_dict = dict(data_table.collect())

    assert expected_sample_number == len(data_dict)

    for sample_id, inst in data_dict.items():
        expected_item = expected_data[sample_id]
        expected_features = expected_item["X"]
        expected_label = expected_item["y"]
        features, label = inst.features, inst.label
        assert_array(expected_features, features)
        # Each expected label is a length-1 row of the (n, 1) y array.
        assert expected_label[0] == label
[-0.879933, 0.420589, -0.877527, -0.780484, -1.037534, -0.48388], [0.426758, 0.723479, 0.316885, 0.287273, 1.000835, 0.962702], [0.963102, 1.467675, 0.829202, 0.772457, -0.038076, -0.468613]]) infile = "../../../../examples/data/unittest_data.csv" ids, X, y = load_data(infile, 0, (2, 8), 1) ids = np.array(ids, dtype=np.int32) X = np.array(X, dtype=np.float64) y = np.array(y, dtype=np.int32) print("ids shape", ids.shape) print("X shape", X.shape) print("y shape", y.shape) assert_array(expected_ids, ids) assert_array(expected_y, y) assert_matrix(expected_X, X) expected_data = {} for i, id in enumerate(expected_ids): expected_data[id] = {"X": expected_X[i], "y": expected_y[i]} init() data_table = feed_into_dtable(ids, X, y.reshape(-1, 1), (0, len(ids)), (0, X.shape[-1])) for item in data_table.collect(): id = item[0] inst = item[1] expected_item = expected_data[id] X_i = expected_item["X"]
def test_create_n_guest_generators(self):
    """Check the guest/host split made by create_guest_host_data_generator.

    With 600 random samples of 33 features, an overlap ratio of 0.2 and
    a guest split ratio of 0.3, the test verifies that:

    * the returned overlap indexes are ``0 .. overlap_size - 1``;
    * the guest and host feature widths add up to the full width of X;
    * the guest and host generators yield the expected sample counts;
    * for every overlapping index, both sides carry the same label
      (equal to ``y[index]``) and hold the leading / trailing feature
      columns of ``X[index]`` respectively.
    """

    def _consume(generator):
        # Drain a generator of (key, instance) items into lookup tables.
        features_by_key = {}
        labels_by_key = {}
        seen_keys = []
        feature_width = 0
        for item in generator:
            key, instance = item[0], item[1]
            feature_width = instance.features.shape[-1]
            seen_keys.append(key)
            features_by_key[key] = instance.features
            labels_by_key[key] = instance.label
        return features_by_key, labels_by_key, seen_keys, feature_width

    X = np.random.rand(600, 33)
    y = np.random.rand(600, 1)
    overlap_ratio = 0.2
    guest_split_ratio = 0.3
    guest_feature_num = 16

    data_size = X.shape[0]
    overlap_size = int(data_size * overlap_ratio)
    expected_overlap_indexes = np.array(range(overlap_size))
    particular_guest_size = int((data_size - overlap_size) * guest_split_ratio)
    expected_guest_size = overlap_size + particular_guest_size
    expected_host_size = overlap_size + data_size - expected_guest_size

    guest_data_generator, host_data_generator, overlap_indexes = (
        create_guest_host_data_generator(
            X,
            y,
            overlap_ratio=overlap_ratio,
            guest_split_ratio=guest_split_ratio,
            guest_feature_num=guest_feature_num,
        )
    )

    # The guest side is consumed completely before the host side.
    (guest_features_dict, guest_labels_dict,
     guest_instances_indexes, observed_guest_width) = _consume(guest_data_generator)
    (host_features_dict, host_labels_dict,
     host_instances_indexes, observed_host_width) = _consume(host_data_generator)

    guest_count = len(guest_instances_indexes)
    host_count = len(host_instances_indexes)

    assert_array(expected_overlap_indexes, overlap_indexes)
    assert len(expected_overlap_indexes) == len(overlap_indexes)
    assert X.shape[-1] == observed_guest_width + observed_host_width
    assert expected_guest_size == guest_count
    assert expected_host_size == host_count

    for index in overlap_indexes:
        assert guest_labels_dict[index] == host_labels_dict[index]
        assert guest_labels_dict[index] == y[index]
        # The column split point is the feature width seen on the guest side.
        assert_matrix(guest_features_dict[index],
                      X[index, :observed_guest_width].reshape(1, -1))
        assert_matrix(host_features_dict[index],
                      X[index, observed_guest_width:].reshape(1, -1))