class TestMasking(unittest.TestCase):
    """Exercise ``Data`` constructed with an explicit mask and no normalization."""

    def setUp(self):
        """Build the fixture: five rows, with the first two mask entries set to True."""
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5],
                "b": [1, 1, 1, 1, 1],
                "c": [1, 2, 3, 3, 4],
            }
        )
        self.X = format_covariate_matrix(self.X)
        self.mask = np.array([True, True, False, False, False])
        self.data = Data(self.X, self.y, self.mask, normalize=False)

    def test_y_sum(self):
        """The summed target is 12, i.e. 3 + 4 + 5 (the False-mask positions)."""
        total = self.data.y.summed_y()
        self.assertEqual(total, 12)

    def test_updating_y_sum(self):
        """After replacing y with its double, the summed target is 24."""
        doubled = self.y * 2
        self.data.update_y(doubled)
        self.assertEqual(self.data.y.summed_y(), 24)

    def test_n_obsv(self):
        """Three observations are reported for the initial mask."""
        self.assertEqual(self.data.X.n_obsv, 3)

    def test_updating_mask(self):
        """Adding ``SplitCondition(0, 4, le)`` updates the masks, counts and sum."""
        from bartpy.splitcondition import SplitCondition
        from operator import le

        condition = SplitCondition(0, 4, le)
        updated_data = self.data + condition

        # The same mask is expected on the data object, its X and its y.
        expected_mask = [True, True, False, False, True]
        self.assertListEqual(list(updated_data.mask), expected_mask)
        self.assertListEqual(list(updated_data.X.mask), expected_mask)
        self.assertListEqual(list(updated_data.y._mask), expected_mask)

        # Both the public count and its private backing attribute are checked.
        self.assertEqual(updated_data.X.n_obsv, 2)
        self.assertEqual(updated_data.X._n_obsv, 2)
        self.assertEqual(updated_data.y.summed_y(), 7)
def setUp(self):
    """Build a normalized ``Data`` fixture from a formatted three-column frame."""
    self.y = np.array([1, 2, 3, 4, 5])
    column_values = {
        "a": [1, 2, 3, 4, 5],
        "b": [1, 1, 1, 1, 1],
        "c": [1, 2, 3, 3, 4],
    }
    self.X = pd.DataFrame(column_values)
    # Replace the raw frame with its formatted form before wrapping it.
    self.X = format_covariate_matrix(self.X)
    self.data = Data(self.X, self.y, normalize=True)
def __init__(self, data: Data, split_conditions: List[SplitCondition]=None, combined_condition=None):
    """Store the data and conditions, then derive the conditioned view of the data.

    Parameters
    ----------
    data:
        Source data; its ``X`` is reused and its ``y`` is deep-copied.
    split_conditions:
        Conditions to store; a fresh empty list is used when omitted.
    combined_condition:
        Stored as given on ``self._combined_condition``.
    """
    conditions = [] if split_conditions is None else split_conditions

    # Private Data instance: same X, an independent copy of y, cache disabled.
    self._data = Data(
        data.X,
        deepcopy(data.y),
        cache=False,
        unique_columns=data.unique_columns,
    )
    self._conditions = conditions
    self._combined_condition = combined_condition

    # self.condition() is evaluated once for X and once more for y.
    self._conditioned_X = self._data.X[self.condition()]
    self._conditioned_data = Data(
        self._conditioned_X,
        self._data._y[self.condition()],
        unique_columns=data.unique_columns,
    )
    self._combined_conditioner = None
def setUp(self):
    """Assemble a five-node tree: ``a`` holds ``b`` and ``c``; ``c`` holds ``d`` and ``e``."""
    covariates = pd.DataFrame({"a": [1, 2]}).values
    targets = np.array([1, 1])
    self.data = Data(covariates, targets)

    # Nodes are built bottom-up so each DecisionNode can receive its children.
    self.d = LeafNode(Split(self.data))
    self.e = LeafNode(Split(self.data))
    self.c = DecisionNode(Split(self.data), self.d, self.e)
    self.b = LeafNode(Split(self.data))
    self.a = DecisionNode(Split(self.data), self.b, self.c)

    nodes = [self.a, self.b, self.c, self.d, self.e]
    self.tree = Tree(nodes)
class TestData(unittest.TestCase):
    """Checks on ``Data`` built directly from a three-column DataFrame with normalization."""

    def setUp(self):
        """Create the shared fixture: five targets and columns ``a``, ``b``, ``c``."""
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 5],
                "b": [1, 1, 1, 1, 1],
                "c": [1, 2, 3, 3, 4],
            }
        )
        self.data = Data(self.X, self.y, normalize=True)

    def test_unnormalization(self):
        """Unnormalizing recovers the raw targets and maps sample values to 3..6."""
        recovered = list(self.data.unnormalized_y)
        self.assertListEqual(recovered, list(self.y))

        rescaled = self.data.unnormalize_y(np.array([0, 0.25, 0.5, 0.75]))
        self.assertListEqual(list(rescaled), [3, 4, 5, 6])

    def test_unique_proportion_of_value_in_variable(self):
        """Value 1 accounts for 0.2 of variable 0."""
        proportion = self.data.proportion_of_value_in_variable(0, 1)
        self.assertEqual(proportion, 0.2)

    def test_non_unique_proportion_of_value_in_variable(self):
        """In variable 2, value 1 accounts for 0.2 and the repeated value 3 for 0.4."""
        self.assertEqual(self.data.proportion_of_value_in_variable(2, 1), 0.2)
        self.assertEqual(self.data.proportion_of_value_in_variable(2, 3), 0.4)

    def test_unique_columns(self):
        """Only variable 0 is reported as a unique column."""
        self.assertEqual(self.data.unique_columns, [0])

    def test_covariates_stored_as_matrix(self):
        """The stored covariates have exact type ``np.ndarray``."""
        stored_type = type(self.data.X)
        self.assertEqual(stored_type, np.ndarray)

    def test_is_not_constant(self):
        """``is_not_constant`` is true for mixed values and false for a constant array."""
        varying = np.array([1, 1, 2, 3])
        self.assertTrue(is_not_constant(varying))
        constant = np.array([1, 1, 1, 1])
        self.assertFalse(is_not_constant(constant))

    def test_n_obsv(self):
        """All five rows are counted as observations."""
        self.assertEqual(self.data.n_obsv, 5)

    def test_normalization(self):
        """Normalized targets span exactly -0.5 to 0.5."""
        self.assertEqual(-0.5, self.data.y.min())
        self.assertEqual(0.5, self.data.y.max())

    def test_splittable_variables(self):
        """Variables 0 and 2 are splittable; variable 1 is not listed."""
        splittable = list(self.data.splittable_variables())
        self.assertListEqual(splittable, [0, 2])

    def test_random_splittable_value(self):
        """Draws for variable 0 stay within 1..4; variable 1 yields ``None``."""
        allowed_values = [1, 2, 3, 4]
        for _ in range(10000):
            self.assertIn(self.data.random_splittable_value(0), allowed_values)
        self.assertIsNone(self.data.random_splittable_value(1))

    def test_random_splittable_variable(self):
        """Draws are always 0 or 2; data holding only column 1 raises."""
        allowed_variables = [0, 2]
        for _ in range(100):
            self.assertIn(self.data.random_splittable_variable(), allowed_variables)

        # Keep only column 1 and expect the dedicated exception.
        self.filtered_data = Data(self.data.X[:, [1]], self.data.y)
        with self.assertRaises(NoSplittableVariableException):
            self.filtered_data.random_splittable_variable()

    def test_n_splittable_variables(self):
        """Two variables are counted as splittable."""
        self.assertEqual(self.data.n_splittable_variables, 2)

    def test_variables(self):
        """The variable indices are 0, 1 and 2."""
        self.assertEqual(self.data.variables, [0, 1, 2])
def test_single_condition_data(self):
    """A ``le``/``gt`` pair on variable 0 at value 1 separates the rows 1 and 2."""
    covariates = pd.DataFrame({"a": [1, 2]}).values
    data = Data(covariates, np.array([1, 2]))

    at_most_one = SplitCondition(0, 1, le)
    above_one = SplitCondition(0, 1, gt)

    # Each side starts from its own fresh Split over the same data.
    left_split = Split(data) + at_most_one
    right_split = Split(data) + above_one

    self.assertListEqual([1], list(left_split.data.X[:, 0]))
    self.assertListEqual([2], list(right_split.data.X[:, 0]))
def _convert_covariates_to_data(self, X: Union[np.ndarray, pd.DataFrame], y: np.ndarray) -> Data:
    """Wrap covariates and targets in a normalized ``Data`` object.

    Also records column labels on ``self.columns``: the frame's own
    ``columns`` when ``X`` is a DataFrame, otherwise the stringified
    positional indices ``"0", "1", ...`` of ``X``'s second axis.

    Parameters
    ----------
    X:
        Covariates, either an array or a DataFrame (converted via ``.values``).
    y:
        Target values.

    Returns
    -------
    Data
        Built from deep copies of ``X`` and ``y`` with ``normalize=True``.
    """
    from copy import deepcopy

    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are unwrapped too instead of falling into the array branch.
    if isinstance(X, pd.DataFrame):
        self.columns = X.columns
        X = X.values
    else:
        self.columns = list(map(str, range(X.shape[1])))

    # Deep copies keep the caller's objects separate from those given to Data.
    return Data(deepcopy(X), deepcopy(y), normalize=True)
def setUp(self):
    """Assemble a five-node tree over a single-row, float-target ``Data`` fixture."""
    covariates = format_covariate_matrix(pd.DataFrame({"a": [1]}))
    targets = np.array([1]).astype(float)
    self.data = Data(covariates, targets)

    # d and e pass an explicit None as the second LeafNode argument; b does not.
    self.d = LeafNode(Split(self.data), None)
    self.e = LeafNode(Split(self.data), None)
    self.c = DecisionNode(Split(self.data), self.d, self.e)
    self.b = LeafNode(Split(self.data))
    self.a = DecisionNode(Split(self.data), self.b, self.c)

    nodes = [self.a, self.b, self.c, self.d, self.e]
    self.tree = Tree(nodes)
def _convert_covariates_to_data(X: np.ndarray, y: np.ndarray) -> Data:
    """Wrap covariates and targets in a normalized ``Data`` with an all-False mask.

    Parameters
    ----------
    X:
        Covariates. A DataFrame is also accepted and is converted via ``.values``.
    y:
        Target values.

    Returns
    -------
    Data
        Built from deep copies of ``X`` and ``y``, a boolean mask of zeros
        shaped like ``X``, and ``normalize=True``.
    """
    from copy import deepcopy

    # isinstance rather than an exact type comparison, so DataFrame
    # subclasses are converted to their underlying array as well.
    if isinstance(X, pd.DataFrame):
        X = X.values

    all_false_mask = np.zeros_like(X).astype(bool)
    return Data(deepcopy(X), deepcopy(y), mask=all_false_mask, normalize=True)
def setUp(self):
    """Build a two-tree ``Model`` over five normalized observations and initialize its trees."""
    self.X = format_covariate_matrix(pd.DataFrame({"a": [1, 2, 3, 4, 5]}))
    self.raw_y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    # self.X is passed through format_covariate_matrix a second time here.
    self.data = Data(format_covariate_matrix(self.X), self.raw_y, normalize=True)

    # Sigma is scaled by the normalizing scale read from the data's y.
    scale = self.data.y.normalizing_scale
    sigma = Sigma(0.001, 0.001, scaling_factor=scale)
    self.model = Model(self.data, sigma, n_trees=2, initializer=None)
    self.model.initialize_trees()
def test_combined_condition_data(self):
    """Combining ``<= 3`` with ``> 1`` on variable 0 leaves the rows 2 and 3."""
    covariates = pd.DataFrame({"a": [1, 2, 3, 4]}).values
    data = Data(covariates, np.array([1, 2, 1, 1]))

    # Both halves of each pair are constructed; only one half of each is applied.
    at_most_three = SplitCondition(0, 3, le)
    above_three = SplitCondition(0, 3, gt)
    at_most_one = SplitCondition(0, 1, le)
    above_one = SplitCondition(0, 1, gt)

    base_split = Split(data)
    narrowed_split = base_split + at_most_three + above_one
    remaining = narrowed_split.data
    self.assertListEqual([2, 3], list(remaining.X[:, 0]))
def test_most_recent_split(self):
    """``most_recent_split_condition`` returns the last condition added to a split."""
    covariates = pd.DataFrame({"a": [1, 2, 3, 4]}).values
    data = Data(covariates, np.array([1, 2, 1, 1]))

    # Both halves of each pair are constructed; only one half of each is applied.
    at_most_three = SplitCondition(0, 3, le)
    above_three = SplitCondition(0, 3, gt)
    at_most_one = SplitCondition(0, 1, le)
    above_one = SplitCondition(0, 1, gt)

    base_split = Split(data)
    two_step_split = base_split + at_most_three + above_one

    one_step_split = base_split + at_most_three
    self.assertEqual(one_step_split.most_recent_split_condition(), at_most_three)
    self.assertEqual(two_step_split.most_recent_split_condition(), above_one)
class TestDataNormalization(unittest.TestCase):
    """Checks that ``Data`` normalizes targets and can reverse the normalization."""

    def setUp(self):
        """Create a normalized ``Data`` fixture from a formatted three-column frame."""
        self.y = np.array([1, 2, 3, 4, 5])
        column_values = {
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        }
        self.X = pd.DataFrame(column_values)
        self.X = format_covariate_matrix(self.X)
        self.data = Data(self.X, self.y, normalize=True)

    def test_unnormalization(self):
        """Unnormalizing recovers the raw targets and maps sample values to 3..6."""
        recovered = list(self.data.unnormalized_y)
        self.assertListEqual(recovered, list(self.y))

        rescaled = self.data.unnormalize_y(np.array([0, 0.25, 0.5, 0.75]))
        self.assertListEqual(list(rescaled), [3, 4, 5, 6])

    def test_normalization(self):
        """Normalized targets span exactly -0.5 to 0.5."""
        self.assertEqual(-0.5, self.data.y.min())
        self.assertEqual(0.5, self.data.y.max())
def setUp(self):
    """Grow a tree in two steps: split the root on variable 0, then its right child on variable 1."""
    covariates = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).values
    self.data = Data(covariates, np.array([1, 2, 3]))

    # First split: a root leaf divided on variable 0 at value 1.
    root_leaf = LeafNode(Split(self.data))
    root_conditions = (SplitCondition(0, 1, le), SplitCondition(0, 1, gt))
    self.a = split_node(root_leaf, root_conditions)
    self.b = self.a.left_child
    self.x = self.a.right_child
    self.tree = Tree([self.a, self.b, self.x])

    # Second split: the root's right child divided on variable 1 at value 2,
    # then applied to the tree as a "grow" mutation replacing x with c.
    child_conditions = (SplitCondition(1, 2, le), SplitCondition(1, 2, gt))
    self.c = split_node(self.a._right_child, child_conditions)
    grow = TreeMutation("grow", self.x, self.c)
    mutate(self.tree, grow)

    self.d = self.c.left_child
    self.e = self.c.right_child
def setUp(self):
    """Grow a tree in two steps over formatted covariates and float targets."""
    raw_frame = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    X = format_covariate_matrix(raw_frame)
    targets = np.array([1, 2, 3]).astype(float)
    self.data = Data(X, targets)

    # First split: a root leaf divided on variable 0 at value 1.
    root_leaf = LeafNode(Split(self.data))
    root_conditions = (SplitCondition(0, 1, le), SplitCondition(0, 1, gt))
    self.a = split_node(root_leaf, root_conditions)
    self.b = self.a.left_child
    self.x = self.a.right_child
    self.tree = Tree([self.a, self.b, self.x])

    # Second split: the root's right child divided on variable 1 at value 2,
    # then applied to the tree as a "grow" mutation replacing x with c.
    child_conditions = (SplitCondition(1, 2, le), SplitCondition(1, 2, gt))
    self.c = split_node(self.a._right_child, child_conditions)
    grow = TreeMutation("grow", self.x, self.c)
    mutate(self.tree, grow)

    self.d = self.c.left_child
    self.e = self.c.right_child
class TestDataCaching(unittest.TestCase):
    """Checks that ``Data`` reports fresh values after ``update_y``."""

    def setUp(self):
        """Create an unnormalized ``Data`` fixture from a formatted three-column frame."""
        self.y = np.array([1, 2, 3, 4, 5])
        column_values = {
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        }
        self.X = pd.DataFrame(column_values)
        self.X = format_covariate_matrix(self.X)
        self.data = Data(self.X, self.y, normalize=False)

    def test_summed_y(self):
        """``summed_y`` matches the raw sum, then twice that after doubling y."""
        initial_total = self.data.summed_y()
        self.assertEqual(initial_total, np.sum(self.y))

        doubled = np.array(self.y * 2)
        self.data.update_y(doubled)
        self.assertEqual(self.data.summed_y(), np.sum(self.y) * 2)

    def test_y(self):
        """``y.data`` holds the raw targets, then the replacement after ``update_y``."""
        self.assertListEqual(list(self.data.y.data), list(self.y))

        replacement = np.array(self.y * 2)
        self.data.update_y(replacement)
        self.assertListEqual(list(self.data.y.data), list(replacement))
def out_of_sample_condition(self, X: np.ndarray):
    """Apply this object's out-of-sample conditioner to ``X``.

    Parameters
    ----------
    X:
        Covariates to evaluate; handed unchanged to the conditioner.

    Returns
    -------
    Whatever ``self.out_of_sample_conditioner().condition(X)`` returns.
    """
    # X goes straight to the conditioner; no Data wrapper is built because
    # nothing in this method would read it.
    conditioner = self.out_of_sample_conditioner()
    return conditioner.condition(X)
def test_random_splittable_variable(self):
    """Draws are always 0 or 2; data holding only column 1 raises."""
    allowed_variables = [0, 2]
    for _ in range(100):
        drawn = self.data.random_splittable_variable()
        self.assertIn(drawn, allowed_variables)

    # Keep only column 1 and expect the dedicated exception.
    only_column_one = self.data.X[:, [1]]
    self.filtered_data = Data(only_column_one, self.data.y)
    with self.assertRaises(NoSplittableVariableException):
        self.filtered_data.random_splittable_variable()
def setUp(self):
    """Create a ``LeafNode`` over an unconditioned ``Split`` of five observations."""
    raw_frame = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
    self.X = format_covariate_matrix(raw_frame)
    targets = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    # self.X is passed through format_covariate_matrix a second time here.
    self.data = Data(format_covariate_matrix(self.X), targets)
    self.split = Split(self.data)
    self.node = LeafNode(self.split)
def setUp(self):
    """Create a single-observation ``Data`` fixture with a float target."""
    raw_frame = pd.DataFrame({"a": [1]})
    self.X = format_covariate_matrix(raw_frame)
    targets = np.array([1.0])
    # self.X is passed through format_covariate_matrix a second time here.
    self.data = Data(format_covariate_matrix(self.X), targets)
def setUp(self):
    """Create a one-row, one-column ``Data`` fixture from raw frame values."""
    covariates = pd.DataFrame({"a": [1]}).values
    targets = np.array([1])
    self.data = Data(covariates, targets)
def test_null_split_returns_all_values(self):
    """A ``Split`` with no conditions exposes the same first column as its source data."""
    covariates = pd.DataFrame({"a": [1, 2]}).values
    data = Data(covariates, np.array([1, 2]))

    unconditioned = Split(data)
    conditioned_data = unconditioned.data

    source_column = list(data.X[:, 0])
    split_column = list(conditioned_data.X[:, 0])
    self.assertListEqual(source_column, split_column)