Example #1
0
    def test_leaf_categorical(self):
        """Conditional categorical leaf: per-class likelihoods must sum to one
        and peak on the cluster each label was generated from."""
        np.random.seed(17)
        # Three well-separated 2-D Gaussian clusters of 500 points each.
        x = np.concatenate(
            (
                np.random.multivariate_normal([20, 20], np.eye(2), 500),
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        # Labels: rows 0-499 -> 2, rows 500-999 -> 1, rows 1000-1499 -> 0.
        y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Categorical])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # Likelihood of each candidate class conditioned on the same features.
        l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
        l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
        l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

        # The three class probabilities must form a distribution.
        np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

        self.assertTrue(np.all(l0[1000:1500] > 0.85))
        self.assertTrue(np.all(l0[0:1000] < 0.15))

        self.assertTrue(np.all(l1[500:1000] > 0.85))
        self.assertTrue(np.all(l1[0:500] < 0.15))
        self.assertTrue(np.all(l1[1000:1500] < 0.15))

        self.assertTrue(np.all(l2[0:500] > 0.85))
        # Fixed typo: the slice end was 15000; the array has only 1500 rows
        # (numpy clamped it, so the check happened to work, but the intent
        # is rows 500..1499).
        self.assertTrue(np.all(l2[500:1500] < 0.15))
Example #2
0
    def test_leaf_bernoulli_bootstrap(self):
        """A bootstrapped conditional Bernoulli leaf should put more mass on
        the observed label of each row than on its complement."""
        np.random.seed(17)
        cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 100)
        cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 100)
        x = np.vstack((cluster_hi, cluster_lo))
        # y=1 paired with the [10, 10] cluster, y=0 with the [1, 1] cluster.
        y = np.array([1] * 100 + [0] * 100).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # Likelihood of the true labels vs. the flipped labels.
        l = likelihood(leaf, data)
        lneg = likelihood(leaf, np.concatenate([1 - y, x], axis=1))

        # P(y|x) + P(1-y|x) must be exactly one for a Bernoulli leaf.
        np.testing.assert_array_almost_equal(l + lneg, 1.0)

        self.assertTrue(np.all(l >= 0.5))
        self.assertTrue(np.all(lneg < 0.5))
Example #3
0
    def test_leaf_gaussian(self):
        """Conditional Gaussian leaf: y~N(20,2) is tied to X=[10,10] and
        y~N(60,2) to X=[1,1]; the leaf must prefer each y under its own
        conditioning features."""
        np.random.seed(17)
        x = np.vstack(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            )
        )
        y = np.concatenate(
            (np.random.normal(20, 2, 5000), np.random.normal(60, 2, 5000))
        ).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # The training data itself must never produce NaN likelihoods.
        self.assertFalse(np.any(np.isnan(likelihood(leaf, data))))

        # Each y is more likely under the features it was generated with.
        self.assertGreater(get_ll(leaf, [20, 10, 10]), get_ll(leaf, [20, 1, 1]))
        self.assertGreater(get_ll(leaf, [60, 1, 1]), get_ll(leaf, [60, 10, 10]))
        # Regression values pinned from a known-good run.
        self.assertAlmostEqual(get_ll(leaf, [60, 1, 1]), 0.3476232862652)
        self.assertAlmostEqual(get_ll(leaf, [20, 10, 10]), 0.3628922322773634)
Example #4
0
    def test_leaf_no_variance_gaussian(self):
        """A conditional Gaussian leaf trained on a constant label column must
        be degenerate: every row gets the same likelihood value."""
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        # Zero-variance label: y == 1 for every row.
        y = np.array([1] * 1000).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])
        l = likelihood(leaf, data)
        # Constant likelihood across rows; 0.39894... == 1/sqrt(2*pi), i.e.
        # the standard-normal pdf at its mean — consistent with a
        # unit-variance fallback for the degenerate fit.
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        # Mutate the label column IN PLACE and retrain: still degenerate,
        # still evaluated at the (new) mean.
        data[:, 0] = 2
        leaf = create_conditional_leaf(data, ds_context, [0])
        l = likelihood(leaf, data)
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        # Train on a copy with y == 3 but evaluate on `data`, whose label
        # column is still 2: 0.24197... matches a unit-variance normal pdf
        # evaluated one unit away from its mean.
        data3 = np.array(data)
        data3[:, 0] = 3
        leaf = create_conditional_leaf(data3, ds_context, [0])
        l = likelihood(leaf, data)
        self.assertAlmostEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.241970724519143)
Example #5
0
    def test_naive_factorization(self):
        """naive_factorization should attach a Product node under the parent
        and emit one CREATE_LEAF_NODE task per variable in the scope."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data_copy = np.array(data)
        result = naive_factorization(
            data=data_copy, parent=parent, pos=0, context=ctx, scope=list(scope)
        )

        # The input array must be left untouched.
        self.assertListEqual(data.tolist(), data_copy.tolist())

        # The freshly attached Product node is the parent of every task.
        self.assertEqual(parent.children[0], result[0][1]['parent'])

        y, x = get_YX(data, 4)

        self.assertEqual(len(result), len(scope))
        for pos, var in enumerate(scope):
            task = result[pos]
            self.assertEqual(len(task), 2)
            op, params = task
            self.assertEqual(op, SplittingOperations.CREATE_LEAF_NODE)
            self.assertEqual(type(params['parent']), Product)
            self.assertEqual(params['pos'], pos)
            self.assertListEqual(params['scope'], [var])
            # Each leaf task sees its single y-column plus all features.
            self.assertListEqual(
                params['data'].tolist(),
                concatenate_yx(y[:, pos], x).tolist(),
            )
Example #6
0
    def test_conditional(self):
        """Smoke test: learn_cspn_structure should run end-to-end with a
        KMeans-based label conditional and Bernoulli label leaves."""
        # Seed for reproducibility, consistent with the other tests here.
        np.random.seed(17)
        labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
        features = np.r_[
            np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))
        ]

        train_data = concatenate_yx(labels, features)

        ds_context = Context(
            parametric_types=[Bernoulli] * labels.shape[1]
        ).add_domains(labels)
        ds_context.feature_size = 2

        def label_conditional(y, x):
            from sklearn.cluster import KMeans

            # NOTE: `precompute_distances` was dropped here — the parameter
            # was deprecated in scikit-learn 0.23 and removed in 1.0, so
            # passing it crashes on current releases.
            return KMeans(n_clusters=2, random_state=17).fit_predict(y)

        spn = learn_cspn_structure(
            train_data,
            ds_context,
            split_rows=get_split_conditional_rows_KMeans(),
            split_cols=getCIGroup(),
            create_leaf=create_conditional_leaf,
            label_conditional=label_conditional,
            cluster_univariate=True,
        )
Example #7
0
    def test_leaf_mpe_bernoulli(self):
        """MPE on a conditional Bernoulli leaf should recover the label
        associated with each feature cluster."""
        np.random.seed(17)
        x = np.vstack(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            )
        )
        # y=0 paired with X around [10, 10]; y=1 paired with X around [1, 1].
        y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # nan in the label column marks the value MPE has to fill in.
        query = np.array([np.nan, 10, 10]).reshape(-1, 3)
        self.assertAlmostEqual(mpe(leaf, query)[0, 0], 0)

        query = np.array([np.nan, 1, 1]).reshape(-1, 3)
        self.assertAlmostEqual(mpe(leaf, query)[0, 0], 1)

        query = np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3)
        res = mpe(leaf, query)
        self.assertAlmostEqual(res[0, 0], 1)
        self.assertAlmostEqual(res[1, 0], 0)

        # A row without any nan is not a valid MPE query and must be rejected.
        with self.assertRaises(AssertionError):
            bad = np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10])
            mpe(leaf, bad.reshape(-1, 3))
Example #8
0
    def test_leaf_mpe_gaussian(self):
        """MPE on a Gaussian leaf built by create_parametric_leaf should fill
        nan labels; expected values are pinned from a known-good run."""
        np.random.seed(17)
        x = np.vstack(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            )
        )
        # y~N(20,2) paired with X=[10,10]; y~N(60,2) paired with X=[1,1].
        y = np.concatenate(
            (np.random.normal(20, 2, 5000), np.random.normal(60, 2, 5000))
        ).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        # This test deliberately builds the leaf with create_parametric_leaf
        # rather than create_conditional_leaf.
        leaf = create_parametric_leaf(data, ds_context, [0])

        res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 20.435226001909466)

        res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)

        res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)
        self.assertAlmostEqual(res[1, 0], 20.435226001909466)

        # Rows with no nan are invalid MPE queries.
        with self.assertRaises(AssertionError):
            bad = np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10])
            mpe(leaf, bad.reshape(-1, 3))
Example #9
0
    def test_create_conditional(self):
        """create_conditional should split the rows via the label conditional,
        attach a conditional node, and emit one GET_NEXT_OP task per branch."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data_copy = np.array(data)

        # Random binary row split: 25% zeros, 75% ones.
        n_zeros = int(data.shape[0] * 0.25)
        split_idx = np.array([0] * n_zeros + [1] * (data.shape[0] - n_zeros))
        np.random.shuffle(split_idx)

        y, x = get_YX(data, 4)

        def label_conditional(local_y, local_x):
            # The conditional must be handed the full, unmodified y and x.
            self.assertListEqual(local_y.tolist(), y.tolist())
            self.assertListEqual(local_x.tolist(), x.tolist())
            return split_idx

        result = create_conditional(
            data=data_copy,
            parent=parent,
            pos=0,
            context=ctx,
            scope=list(scope),
            label_conditional=label_conditional,
        )

        # The input array must be left untouched.
        self.assertListEqual(data.tolist(), data_copy.tolist())

        self.assertEqual(len(result), 2)

        for pos, (op, params) in enumerate(result):
            self.assertEqual(op, SplittingOperations.GET_NEXT_OP)
            self.assertIn('data', params)
            self.assertEqual(parent.children[0], params['parent'])
            self.assertEqual(params['pos'], pos)
            self.assertListEqual(scope, params['scope'])
            self.assertEqual(params['data'].shape[1], data.shape[1])

        conditional_node = result[0][1]['parent']

        # Ask the created conditional node itself how it routes each row.
        child_idx = conditional_supervised_likelihood(
            conditional_node,
            [np.zeros((data.shape[0], 1)), np.ones((data.shape[0], 1))],
            data,
        )

        self.assertListEqual(
            result[0][1]['data'].tolist(),
            data[child_idx[:, 0] == 0, :].tolist(),
        )
        self.assertListEqual(
            result[1][1]['data'].tolist(),
            data[child_idx[:, 0] == 1, :].tolist(),
        )
Example #10
0
    def test_create_sum_with_split(self):
        """create_sum with split_on_sum=True should forward each row partition
        to its own child task and leave normalized weights on the Sum node."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data_copy = np.array(data)

        # Fixed 25/75 random row partition.
        n_zeros = int(data.shape[0] * 0.25)
        split_idx = np.array([0] * n_zeros + [1] * (data.shape[0] - n_zeros))
        np.random.shuffle(split_idx)

        def split_rows(local_data, context, local_scope):
            # Hand back both partitions together with their cluster weights.
            return [
                (local_data[split_idx == 0, :], local_scope, 0.25),
                (local_data[split_idx == 1, :], local_scope, 0.75),
            ]

        result = create_sum(
            data=data_copy,
            parent=parent,
            pos=0,
            context=ctx,
            scope=list(scope),
            split_rows=split_rows,
            split_on_sum=True,
        )

        # The input array must be left untouched.
        self.assertListEqual(data.tolist(), data_copy.tolist())

        self.assertEqual(len(result), 2)
        for pos, (op, params) in enumerate(result):
            self.assertEqual(op, SplittingOperations.GET_NEXT_OP)
            self.assertIn('data', params)
            self.assertEqual(parent.children[0], params['parent'])
            self.assertEqual(params['pos'], pos)
            self.assertListEqual(scope, params['scope'])
            self.assertEqual(params['data'].shape[1], data.shape[1])
            self.assertEqual(
                params['data'].shape[0], int(np.sum(split_idx == pos))
            )

        self.assertListEqual(
            result[0][1]['data'].tolist(), data[split_idx == 0, :].tolist()
        )
        self.assertListEqual(
            result[1][1]['data'].tolist(), data[split_idx == 1, :].tolist()
        )
        # The created Sum node's child weights must sum to one.
        self.assertAlmostEqual(np.sum(parent.children[0].weights), 1.0)
Example #11
0
    def test_leaf_sampling(self):
        """Sampling from a conditional Gaussian leaf should reproduce the
        y-cluster associated with the conditioning features."""
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        # y~N(20,2) paired with X=[10,10]; y~N(60,2) paired with X=[1,1].
        y = np.array(
            np.random.normal(20, 2, 5000).tolist() +
            np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # Pass a RandomState (not a bare int seed) as rand_gen, consistent
        # with the other sampling test in this file; RandomState(17) yields
        # the same stream the pinned means were produced with.
        res = sample_instances(
            leaf,
            np.array([np.nan, 10, 10] * 1000).reshape(-1, 3),
            RandomState(17))
        self.assertAlmostEqual(np.mean(res[:, 0]), 20.456669723751173)

        res = sample_instances(
            leaf,
            np.array([np.nan, 1, 1] * 1000).reshape(-1, 3),
            RandomState(17))
        self.assertAlmostEqual(np.mean(res[:, 0]), 59.496663076099196)

        res = sample_instances(
            leaf,
            np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3),
            RandomState(17))
        self.assertAlmostEqual(np.mean(res[::2, 0]), 59.546359637084564)
        self.assertAlmostEqual(np.mean(res[1::2, 0]), 20.452118792501008)

        # Rows without nan are invalid sampling queries.
        with self.assertRaises(AssertionError):
            sample_instances(
                leaf,
                np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10,
                          10]).reshape(-1, 3), RandomState(17))
Example #12
0
    def test_leaf_no_variance_bernoulli(self):
        """A conditional Bernoulli leaf trained on an all-ones label column
        should give every training row a likelihood of at least 0.5."""
        np.random.seed(17)
        cluster_hi = np.random.multivariate_normal([10, 10], np.eye(2), 500)
        cluster_lo = np.random.multivariate_normal([1, 1], np.eye(2), 500)
        x = np.vstack((cluster_hi, cluster_lo))
        # Degenerate label: y == 1 for every row.
        y = np.array([1] * 1000).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])
        self.assertTrue(np.all(likelihood(leaf, data) >= 0.5))
Example #13
0
    def test_remove_non_informative_features(self):
        """Zero-variance label columns should be split off into their own
        leaf tasks, leaving the informative columns grouped together."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)
        # Make two label columns constant (zero variance).
        data[:, 1] = 1
        data[:, 3] = 3

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data_copy = np.array(data)

        y, x = get_YX(data, 4)

        uninformative = np.var(y, 0) == 0
        result = remove_non_informative_features(
            data=data_copy,
            parent=parent,
            pos=0,
            context=ctx,
            scope=list(scope),
            uninformative_features_idx=uninformative)

        # The input array must be left untouched.
        self.assertListEqual(data.tolist(), data_copy.tolist())

        self.assertEqual(len(parent.children[0].children), len(result))

        # Constant y-columns (scope vars 3 and 6) get individual tasks; the
        # informative columns stay grouped under scope [1, 4].
        expected_scopes = [[3], [6], [1, 4]]
        expected_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]

        for pos, task in enumerate(result):
            self.assertEqual(len(task), 2)
            params = task[1]
            self.assertEqual(type(params['parent']), Product)
            self.assertEqual(parent.children[0], params['parent'])
            self.assertListEqual(params['scope'], expected_scopes[pos])
            self.assertEqual(params['pos'], pos)

            self.assertListEqual(
                params['data'].tolist(),
                concatenate_yx(expected_y[pos], x).tolist())
Example #14
0
    def test_leaf_sampling_categorical(self):
        """Sampling from a conditional categorical leaf should return, up to
        sampling noise, the class associated with the conditioning features."""
        np.random.seed(17)
        x = np.vstack(
            (
                np.random.multivariate_normal([20, 20], np.eye(2), 500),
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            )
        )
        # Classes 2, 1, 0 paired with clusters [20,20], [10,10], [1,1].
        y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Categorical])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # nan marks the label to sample; means are checked to one decimal.
        query = np.array([np.nan, 10, 10] * 1000).reshape(-1, 3)
        res = sample_instances(leaf, query, RandomState(17))
        self.assertAlmostEqual(np.mean(res[:, 0]), 1, 1)

        query = np.array([np.nan, 1, 1] * 1000).reshape(-1, 3)
        res = sample_instances(leaf, query, RandomState(17))
        self.assertAlmostEqual(np.mean(res[:, 0]), 0, 1)

        query = np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3)
        res = sample_instances(leaf, query, RandomState(17))
        self.assertAlmostEqual(np.mean(res[::2, 0]), 0, 1)
        self.assertAlmostEqual(np.mean(res[1::2, 0]), 1, 1)

        # Rows without nan are invalid sampling queries.
        with self.assertRaises(AssertionError):
            bad = np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10])
            sample_instances(leaf, bad.reshape(-1, 3), RandomState(17))