示例#1
0
    def test_naive_factorization(self):
        """naive_factorization must leave its input untouched and emit one
        CREATE_LEAF_NODE operation, under a freshly attached Product node,
        for every variable in the scope."""
        np.random.seed(17)
        source = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        context = Context()
        context.feature_size = 4

        scope = [1, 3, 4, 6]
        working_copy = np.array(source)
        ops = naive_factorization(data=working_copy,
                                  parent=parent,
                                  pos=0,
                                  context=context,
                                  scope=list(scope))

        # the input data must not be mutated
        self.assertListEqual(source.tolist(), working_copy.tolist())

        # the new node was attached in place of the placeholder child
        self.assertEqual(parent.children[0], ops[0][1]['parent'])

        y, x = get_YX(source, 4)

        self.assertEqual(len(ops), len(scope))
        for position, scope_var in enumerate(scope):
            op = ops[position]
            self.assertEqual(len(op), 2)
            op_type, op_params = op
            self.assertEqual(op_type, SplittingOperations.CREATE_LEAF_NODE)
            self.assertEqual(type(op_params['parent']), Product)
            self.assertEqual(op_params['pos'], position)
            self.assertListEqual(op_params['scope'], [scope_var])
            self.assertListEqual(op_params['data'].tolist(),
                                 concatenate_yx(y[:, position], x).tolist())
示例#2
0
    def test_conditional(self):
        """Learn a CSPN structure where the label split is driven by a
        user-supplied KMeans conditional over the labels."""
        labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
        features = np.c_[
            np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
        ]

        train_data = concatenate_yx(labels, features)

        ds_context = Context(
            parametric_types=[Bernoulli] * labels.shape[1]
        ).add_domains(labels)
        ds_context.feature_size = 2

        def label_conditional(y, x):
            from sklearn.cluster import KMeans

            # NOTE: `precompute_distances` was deprecated in scikit-learn
            # 0.23 (it had no effect) and removed in 1.0, so it is no
            # longer passed here.
            clusters = KMeans(n_clusters=2, random_state=17).fit_predict(y)
            return clusters

        spn = learn_cspn_structure(
            train_data,
            ds_context,
            split_rows=get_split_conditional_rows_KMeans(),
            split_cols=getCIGroup(),
            create_leaf=create_conditional_leaf,
            label_conditional=label_conditional,
            cluster_univariate=True,
        )
示例#3
0
def learn_parametric_spn(data, parametric_types):
    """Learn a parametric SPN over *data*.

    :param data: 2-D numpy array, one row per instance
    :param parametric_types: list of parametric leaf types, one per column
    :return: the learned SPN root node
    """
    from spn.algorithms.LearningWrappers import learn_parametric

    # add_domains() is chained on construction; the original called it a
    # second time redundantly.
    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    spn = learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
    return spn
示例#4
0
    def test_leaf_mpe_bernoulli(self):
        """MPE on a conditional Bernoulli leaf should recover the label
        associated with each feature cluster."""
        np.random.seed(17)
        # two well-separated 2-D feature clusters, 5000 samples each
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)

        # associates y=0 with X=[10,10]
        # associates y=1 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # y unknown (nan): MPE should fill in the cluster's label
        res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 0)

        res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 1)

        # batched query: both rows answered independently
        res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 1)
        self.assertAlmostEqual(res[1, 0], 0)

        # a row whose label is already observed (y=5) must be rejected —
        # presumably because there is nothing left to infer; TODO confirm
        with self.assertRaises(AssertionError):
            mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
示例#5
0
    def test_optimization(self):
        """EM optimization should improve the likelihood after the learned
        parameters are deliberately perturbed, and recover the 0.5/0.5
        mixture weights and the generating means 10 and 30."""
        np.random.seed(17)
        d1 = np.random.normal(10, 1, size=4000).tolist()
        d2 = np.random.normal(30, 1, size=4000).tolist()
        data = d1 + d2
        data = np.array(data).reshape((-1, 4))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])
        ds_context.add_domains(data)

        spn = learn_parametric(data, ds_context)

        # perturb the learned parameters away from the optimum
        spn.weights = [0.8, 0.2]
        spn.children[0].children[0].mean = 3.0

        py_ll = np.sum(log_likelihood(spn, data))

        print(spn.weights, spn.children[0].children[0].mean)

        EM_optimization(spn, data, iterations=1000)

        print(spn.weights, spn.children[0].children[0].mean)

        py_ll_opt = np.sum(log_likelihood(spn, data))

        # EM must not decrease the data log-likelihood
        self.assertLessEqual(py_ll, py_ll_opt)
        self.assertAlmostEqual(spn.weights[0], 0.5, 4)
        self.assertAlmostEqual(spn.weights[1], 0.5, 4)

        # the two mixture components should land on the generating means
        c1_mean = spn.children[0].children[0].mean
        c2_mean = spn.children[1].children[0].mean
        self.assertEqual(round(min(c1_mean, c2_mean)), 10)
        self.assertEqual(round(max(c1_mean, c2_mean)), 30)
示例#6
0
    def test_leaf_gaussian(self):
        """A conditional Gaussian leaf should score matching (y, x) pairs
        higher than mismatched ones, with regression values pinned for
        seed 17."""
        np.random.seed(17)
        # two well-separated 2-D feature clusters, 5000 samples each
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array(
            np.random.normal(20, 2, 5000).tolist() +
            np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)

        # associates y=20 with X=[10,10]
        # associates y=60 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        # likelihood must be defined (non-nan) on every training row
        self.assertFalse(np.any(np.isnan(likelihood(leaf, data))))

        # matching (y, x) pairings score higher than mismatched ones
        self.assertGreater(get_ll(leaf, [20, 10, 10]),
                           get_ll(leaf, [20, 1, 1]))
        self.assertGreater(get_ll(leaf, [60, 1, 1]),
                           get_ll(leaf, [60, 10, 10]))
        # regression values pinned for np.random.seed(17)
        self.assertAlmostEqual(get_ll(leaf, [60, 1, 1]), 0.3476232862652)
        self.assertAlmostEqual(get_ll(leaf, [20, 10, 10]), 0.3628922322773634)
示例#7
0
    def test_leaf_mpe_gaussian(self):
        """MPE on a parametric Gaussian leaf should reconstruct the label
        value associated with each feature cluster (values pinned for
        seed 17)."""
        np.random.seed(17)
        # two well-separated 2-D feature clusters, 5000 samples each
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array(np.random.normal(20, 2, 5000).tolist() + np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)

        # associates y=20 with X=[10,10]
        # associates y=60 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        # leaf = create_conditional_leaf(data, ds_context, [0])
        leaf = create_parametric_leaf(data, ds_context, [0])

        # y unknown (nan): MPE fills in the label (seed-pinned values)
        res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 20.435226001909466)

        res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)

        # batched query: rows answered independently
        res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)
        self.assertAlmostEqual(res[1, 0], 20.435226001909466)

        # a row whose label is already observed (y=5) must be rejected —
        # presumably because there is nothing left to infer; TODO confirm
        with self.assertRaises(AssertionError):
            mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
示例#8
0
 def test_conditional_probability(self):
     """Check conditional_probability on a hand-built categorical SPN."""
     # test if conditional probability is correct
     # same spn as in entropy test
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): Context normally exposes `parametric_types` (plural);
     # this singular attribute looks unused downstream — confirm.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # tests: conditional_probability(spn, rv_index, instance) queries
     # P(RV = instance[rv_index] | other RVs in the instance)
     x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
     self.assertAlmostEqual(conditional_probability(spn, 2, x_instance)[0][0], 0.9)
     self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.48)
     x_instance = np.array([2, 1, 0], dtype=float).reshape(1, -1)
     self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.36)
示例#9
0
    def test_leaf_no_variance_gaussian(self):
        """Conditional Gaussian leaves built from a constant label column
        should produce the same likelihood for every row."""
        np.random.seed(17)
        # two 2-D feature clusters, 500 samples each
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        # constant label: zero variance in y
        y = np.array([1] * 1000).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])
        l = likelihood(leaf, data)
        # constant likelihood; 0.39894... is the N(0,1) density at 0
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        data[:, 0] = 2
        leaf = create_conditional_leaf(data, ds_context, [0])
        l = likelihood(leaf, data)
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        data3 = np.array(data)
        data3[:, 0] = 3
        leaf = create_conditional_leaf(data3, ds_context, [0])
        # NOTE: evaluated on `data` (y=2), one unit from the leaf's
        # constant label 3 — 0.24197... is the N(0,1) density at 1
        l = likelihood(leaf, data)
        self.assertAlmostEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.241970724519143)
示例#10
0
    def test_leaf_categorical(self):
        """A conditional Categorical leaf over three separated clusters
        must put most mass on each cluster's own label, and the three
        label likelihoods must sum to one per row."""
        np.random.seed(17)
        # three well-separated 2-D feature clusters, 500 samples each
        x = np.concatenate(
            (
                np.random.multivariate_normal([20, 20], np.eye(2), 500),
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Categorical])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
        l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
        l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

        # the categorical distribution must be normalized per row
        np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

        # label 0 belongs to the [1,1] cluster (rows 1000:1500)
        self.assertTrue(np.all(l0[1000:1500] > 0.85))
        self.assertTrue(np.all(l0[0:1000] < 0.15))

        # label 1 belongs to the [10,10] cluster (rows 500:1000)
        self.assertTrue(np.all(l1[500:1000] > 0.85))
        self.assertTrue(np.all(l1[0:500] < 0.15))
        self.assertTrue(np.all(l1[1000:1500] < 0.15))

        # label 2 belongs to the [20,20] cluster (rows 0:500).
        # The original sliced 500:15000 — a typo for 1500; numpy clips
        # out-of-range bounds, so the fix does not change behavior.
        self.assertTrue(np.all(l2[0:500] > 0.85))
        self.assertTrue(np.all(l2[500:1500] < 0.15))
示例#11
0
def get_ds_context_sum(curr_train_data, scope, index, scope_index, params):
    """
    returns the Context object of spflow to use with split_rows method while creating sum node for spmn

    :param curr_train_data: 2-D numpy array of the current data slice
    :param scope: scope (column indices) for the Context
    :param index: position in params.partial_order from which variable
        sets remain to be modeled
    :param scope_index: unused here; kept for signature parity with
        get_ds_context_prod
    :param params: SPMN params holding partial_order, util_to_bin and
        utility_node
    :return: spflow Context with domains added from curr_train_data
    """
    n = curr_train_data.shape[1]
    # every variable set from `index` to the end of the partial order
    # (the original sliced with `len(...) + 1`, which is equivalent to
    # an open-ended slice), flattened into a single name list
    remaining_var_sets = params.partial_order[index:]
    feature_names = [
        var for var_set in remaining_var_sets for var in var_set
    ]

    if params.util_to_bin:
        # utilities were binarized: every variable is Categorical
        ds_context = Context(
            parametric_types=[Categorical] * n,
            scope=scope,
            feature_names=feature_names).add_domains(curr_train_data)

    # utilty is meta type -- real
    else:
        if params.utility_node[0] in feature_names:
            # last column is the utility variable
            meta_types = [MetaType.DISCRETE] * (n - 1) + [MetaType.REAL]
        else:
            meta_types = [MetaType.DISCRETE] * n
        # (the original's no-op `scope = scope` has been removed)
        ds_context = Context(
            meta_types=meta_types, scope=scope,
            feature_names=feature_names).add_domains(curr_train_data)

    return ds_context
示例#12
0
def get_ds_context_prod(curr_train_data, scope, index, scope_index, params):
    """
    returns the Context object of spflow to use with split_cols, learn_mspn or learn_parametric methods of spflow while creating product node for spmn

    :param curr_train_data: 2-D numpy array of the current data slice
    :param scope: scope (column indices) for the Context
    :param index: unused here; kept for signature parity with
        get_ds_context_sum
    :param scope_index: offset into params.feature_names for this slice
    :param params: SPMN params holding feature_names, util_to_bin and
        utility_node
    :return: spflow Context with domains added from curr_train_data
    """
    n = curr_train_data.shape[1]
    scope_var = params.feature_names[scope_index:scope_index + n]

    # if parametric, all variables are meta type -- categorical
    if params.util_to_bin:
        ds_context = Context(
            parametric_types=[Categorical] * n, scope=scope,
            feature_names=scope_var).add_domains(curr_train_data)

    # if mixed, utilty is meta type -- real
    else:
        if params.utility_node[0] in scope_var:
            # last column is the utility variable
            meta_types = [MetaType.DISCRETE] * (n - 1) + [MetaType.REAL]
        else:
            meta_types = [MetaType.DISCRETE] * n

        # (the original's no-op `scope = scope` and dead `context = []`
        # initializer have been removed)
        ds_context = Context(
            meta_types=meta_types, scope=scope,
            feature_names=scope_var).add_domains(curr_train_data)
    return ds_context
示例#13
0
    def test_leaf_bernoulli_bootstrap(self):
        """A conditional Bernoulli leaf over two separated clusters must
        assign each row probability >= 0.5 for its true label, and the
        two complementary label likelihoods must sum to one."""
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 100),
                np.random.multivariate_normal([1, 1], np.eye(2), 100),
            ),
            axis=0,
        )
        y = np.array([1] * 100 + [0] * 100).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        lik_true = likelihood(leaf, data)
        flipped_labels = np.concatenate([1 - y, x], axis=1)
        lik_flipped = likelihood(leaf, flipped_labels)

        # complementary labels must have complementary likelihoods
        np.testing.assert_array_almost_equal(lik_true + lik_flipped, 1.0)

        self.assertTrue(np.all(lik_true >= 0.5))
        self.assertTrue(np.all(lik_flipped < 0.5))
示例#14
0
    def test_histogram_to_str_and_back(self):
        """Serializing a histogram leaf to text and reconstructing it must
        yield an equivalent object."""
        samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ctx = Context([MetaType.DISCRETE])
        ctx.add_domains(samples)
        leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
        self.check_obj_and_reconstruction(leaf)
示例#15
0
 def test_histogram_leaf(self):
     """MPE of a discrete histogram leaf is its most frequent value (3)."""
     samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
     ctx = Context([MetaType.DISCRETE])
     ctx.add_domains(samples)
     leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
     recovered = mpe(leaf, np.array([[np.nan]]))
     self.assertTrue(np.array_equal(recovered, np.array([[3]])),
                     "mpe should be 3")
示例#16
0
 def test_valid_histogram(self):
     """A KDE-sourced histogram over clustered reals must end up with
     more than one representative bin point."""
     np.random.seed(17)
     values = [1] + [5] * 20 + [7] + [10] * 50 + [20] + [30] * 10
     samples = np.asarray(values).reshape((-1, 1))
     ctx = Context([MetaType.REAL])
     ctx.add_domains(samples)
     leaf = create_histogram_leaf(samples, ctx, [0], alpha=False,
                                  hist_source="kde")
     self.assertGreater(len(leaf.bin_repr_points), 1)
示例#17
0
 def test_PWL_no_variance(self):
     """Building a piecewise-linear leaf from constant data must fail
     with an AssertionError."""
     samples = np.array([1.0, 1.0]).reshape(-1, 1)
     ctx = Context([MetaType.REAL])
     ctx.add_domains(samples)
     with self.assertRaises(AssertionError):
         create_piecewise_leaf(samples, ctx, scope=[0], hist_source="kde")
示例#18
0
    def test_PWL_no_variance(self):
        """A piecewise-linear leaf built from two identical points should
        assign both training rows the same density of 2/6."""
        samples = np.array([1.0, 1.0]).reshape(-1, 1)
        ctx = Context([MetaType.REAL])
        ctx.add_domains(samples)
        leaf = create_piecewise_leaf(samples, ctx, scope=[0], hist_source="kde")
        densities = np.exp(log_likelihood(leaf, samples))

        for row in range(2):
            self.assertAlmostEqual(float(densities[row]), 2 / 6)
示例#19
0
 def test_we_score(self):
     # test if we_score is correct
     """
     # explain how training data and the spn comes
     # number of RVs
     M = 3
     # table of probabilities
     p1 = 0.6
     p2 = 0.3
     p31 = 0.1
     p32 = 0.9
     # generate x1 and x2
     x1 = np.random.binomial(1, p1, size=N) + np.random.binomial(1, p1, size=N)
     x2 = np.random.binomial(1, p2, size=N)
     x3 = np.zeros(N)
     # generate x3
     for i in range(N):
         if x2[i] == 1:
             x3[i] = np.random.binomial(1, p31, size=1)
         else:
             x3[i] = np.random.binomial(1, p32, size=1)
     # form a matrix, rows are instances and columns are RVs
     train_data = np.concatenate((x1, x2, x3)).reshape((M, N)).transpose()
     """
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): Context normally exposes `parametric_types` (plural);
     # this singular attribute looks unused downstream — confirm.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # test
     # presumably a sample count forwarded to weight_of_evidence — confirm
     n = 40000
     x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
     y_index = 0
     we = weight_of_evidence(spn, 0, x_instance, n, ds_context.domains[y_index].shape[0])
     # expected: zero evidence weight for the two non-target RVs; the
     # target position itself is nan and is masked out before comparing
     we_true = np.array([[np.nan, 0, 0]])
     we = we[~np.isnan(we)]
     we_true = we_true[~np.isnan(we_true)]
     self.assertTrue((we == we_true).all())
示例#20
0
    def test_create_conditional(self):
        """create_conditional must split the rows into two children using
        the supplied label_conditional, leave the input unmodified, and
        route each row to the child its gating function selects."""

        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data2 = np.array(data)

        # random binary partition: 25% zeros, 75% ones
        K = int(data.shape[0] * 0.25)
        split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
        np.random.shuffle(split_idx)

        y, x = get_YX(data, 4)

        def label_conditional(local_y, local_x):
            # the callback must receive exactly the (y, x) split of data
            self.assertListEqual(local_y.tolist(), y.tolist())
            self.assertListEqual(local_x.tolist(), x.tolist())
            return split_idx

        result = create_conditional(data=data2,
                                    parent=parent,
                                    pos=0,
                                    context=ctx,
                                    scope=list(scope),
                                    label_conditional=label_conditional)

        # the input data must not be mutated
        self.assertListEqual(data.tolist(), data2.tolist())

        # one follow-up operation per cluster
        self.assertEqual(len(result), 2)

        for i, r in enumerate(result):
            self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
            self.assertIn('data', r[1])
            self.assertEqual(parent.children[0], r[1]['parent'])
            self.assertEqual(r[1]['pos'], i)
            self.assertListEqual(scope, r[1]['scope'])
            self.assertEqual(r[1]['data'].shape[1], data.shape[1])

        conditional_node = result[0][1]['parent']

        # ask the created conditional node which child each row routes to
        child_idx = conditional_supervised_likelihood(
            conditional_node,
            [np.zeros((data.shape[0], 1)),
             np.ones((data.shape[0], 1))], data)

        # each op's data slice must be exactly the rows of its child
        self.assertListEqual(result[0][1]['data'].tolist(),
                             data[child_idx[:, 0] == 0, :].tolist())
        self.assertListEqual(result[1][1]['data'].tolist(),
                             data[child_idx[:, 0] == 1, :].tolist())
示例#21
0
def run_experiment(dataset, top_n_features, linear=False):
    """Learn an MSPN on the leading `top_n_features` columns of `dataset`
    and persist the result via save_exp.

    :param dataset: tuple of (name, words, data, train, _, statistical_type, _)
    :param top_n_features: number of leading feature columns to keep
    :param linear: forwarded to learn_mspn
    """
    ds_name, words, data, train, _, statistical_type, _ = dataset

    # restrict every view of the dataset to the leading features
    data = data[:, 0:top_n_features]
    words = words[0:top_n_features]
    train = train[:, 0:top_n_features]

    ds_context = Context()
    ds_context.statistical_type = statistical_type
    add_domains(data, ds_context)

    # NOTE(review): `memory` is a module-level object not defined in this
    # function (presumably a joblib cache) — confirm at module scope.
    spn = learn_mspn(train, ds_context, linear=linear, memory=memory)
    save_exp(spn, ds_name, top_n_features, words, data)
示例#22
0
    def test_Histogram_discrete_inference(self):
        """Discrete histogram likelihoods equal the empirical frequencies;
        with alpha enabled the values match add-one smoothing."""
        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
        hist = create_histogram_leaf(data, ds_context, [0], alpha=False)
        prob = np.exp(log_likelihood(hist, data))

        # raw relative frequencies: P(1)=2/6, P(2)=1/6, P(3)=3/6
        self.assertAlmostEqual(float(prob[0]), 2 / 6)
        self.assertAlmostEqual(float(prob[1]), 2 / 6)
        self.assertAlmostEqual(float(prob[2]), 1 / 6)
        self.assertAlmostEqual(float(prob[3]), 3 / 6)
        self.assertAlmostEqual(float(prob[4]), 3 / 6)
        self.assertAlmostEqual(float(prob[5]), 3 / 6)

        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
        hist = create_histogram_leaf(data, ds_context, [0], alpha=True)
        # print(np.var(data.shape[0]))
        prob = np.exp(log_likelihood(hist, data))
        # values are consistent with add-one smoothing over the 3 observed
        # bins: (count + 1) / (6 + 3)
        self.assertAlmostEqual(float(prob[0]), 3 / 9)
        self.assertAlmostEqual(float(prob[1]), 3 / 9)
        self.assertAlmostEqual(float(prob[2]), 2 / 9)
        self.assertAlmostEqual(float(prob[3]), 4 / 9)
        self.assertAlmostEqual(float(prob[4]), 4 / 9)
        self.assertAlmostEqual(float(prob[5]), 4 / 9)
示例#23
0
    def test_PWL(self):
        """Smoke test: building a KDE-sourced piecewise-linear leaf from a
        bimodal sample and evaluating its density must not raise."""
        #data = np.array([1.0, 1.0, 2.0, 3.0]*100).reshape(-1, 1)

        # NOTE(review): no seed is set, so the sample differs per run, and
        # nothing is asserted on `prob` — this only checks for crashes.
        data = np.r_[np.random.normal(10, 5, (300, 1)),
                     np.random.normal(20, 10, (700, 1))]

        ds_context = Context([MetaType.REAL])
        ds_context.add_domains(data)
        leaf = create_piecewise_leaf(data,
                                     ds_context,
                                     scope=[0],
                                     prior_weight=None,
                                     hist_source="kde")
        prob = np.exp(log_likelihood(leaf, data))
示例#24
0
    def test_create_sum_with_split(self):
        """create_sum with split_on_sum=True must partition the rows using
        the supplied split_rows function, keep the input unmodified, and
        leave the created node with normalized weights."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data2 = np.array(data)

        # random binary partition: 25% zeros, 75% ones
        K = int(data.shape[0] * 0.25)
        split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
        np.random.shuffle(split_idx)

        def split_rows(data, context, scope):
            # deterministic splitter: (rows, scope, weight) per cluster
            result = []
            result.append((data[split_idx == 0, :], scope, 0.25))
            result.append((data[split_idx == 1, :], scope, 0.75))
            return result

        result = create_sum(data=data2,
                            parent=parent,
                            pos=0,
                            context=ctx,
                            scope=list(scope),
                            split_rows=split_rows,
                            split_on_sum=True)

        # the input data must not be mutated
        self.assertListEqual(data.tolist(), data2.tolist())

        # one follow-up operation per cluster
        self.assertEqual(len(result), 2)
        for i, r in enumerate(result):
            self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
            self.assertIn('data', r[1])
            self.assertEqual(parent.children[0], r[1]['parent'])
            self.assertEqual(r[1]['pos'], i)
            self.assertListEqual(scope, r[1]['scope'])
            self.assertEqual(r[1]['data'].shape[1], data.shape[1])
            self.assertEqual(r[1]['data'].shape[0],
                             int(np.sum(split_idx == i)))

        # each op carries exactly the rows of its cluster
        self.assertListEqual(result[0][1]['data'].tolist(),
                             data[split_idx == 0, :].tolist())
        self.assertListEqual(result[1][1]['data'].tolist(),
                             data[split_idx == 1, :].tolist())
        # the created sum node's weights must sum to one
        self.assertAlmostEqual(np.sum(parent.children[0].weights), 1.0)
示例#25
0
 def test_mixture_gaussians(self):
     """A KDE-sourced histogram fitted to a 50/50 Gaussian mixture should
     approximate the true mixture density within a total absolute error
     of 7 over the evaluation grid."""
     np.random.seed(17)
     data = np.random.normal(10, 1, size=200).tolist() + np.random.normal(30, 1, size=200).tolist()
     data = np.array(data).reshape((-1, 1))
     ds_context = Context([MetaType.REAL])
     ds_context.add_domains(data)
     hist = create_histogram_leaf(data, ds_context, [0], alpha=False, hist_source="kde")
     # evaluate on a dense grid plus the training points themselves
     x = np.linspace(0, 60, 1000).tolist() + data[:, 0].tolist()
     x = np.sort(x)
     from scipy.stats import norm
     # ground-truth density of the generating mixture
     y = 0.5 * norm.pdf(x, 10, 1) + 0.5 * norm.pdf(x, 30, 1)
     ye = likelihood(hist, x.reshape((-1, 1)))
     error = np.sum(np.abs(ye[:, 0] - y))
     # print(error)
     self.assertLessEqual(error, 7)
示例#26
0
def get_ds_context(data, scope, params):
    """
    :param data: numpy array of data for Context object
    :param scope: scope of data
    :param params: params of SPMN
    :return: Context object of SPFlow
    """
    # (the original computed an unused `num_of_variables` local; removed)
    scope_var = np.array(params.feature_names)[scope].tolist()
    ds_context = Context(meta_types=[params.meta_types[i] for i in scope],
                         scope=scope,
                         feature_names=scope_var)
    ds_context.add_domains(data)
    return ds_context
示例#27
0
def learn_PSPN():
    """Learn a parametric SPN on synthetic mixed discrete/continuous data
    and print its structure statistics."""
    import numpy as np

    np.random.seed(123)

    # two discrete columns, one continuous, one dependent continuous
    binary_col = np.random.randint(2, size=1000).reshape(-1, 1)
    ternary_col = np.random.randint(3, size=1000).reshape(-1, 1)
    cont_col = np.r_[np.random.normal(10, 5, (300, 1)),
                     np.random.normal(20, 10, (700, 1))]
    target_col = 5 * binary_col + 3 * ternary_col + cont_col
    train_data = np.c_[binary_col, ternary_col, cont_col, target_col]

    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

    leaf_types = [Categorical, Categorical, Gaussian, Gaussian]
    ds_context = Context(parametric_types=leaf_types).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))
示例#28
0
def learn_MSPN():
    """Learn a mixed SPN (MSPN) on synthetic discrete/continuous data and
    print its structure statistics."""
    import numpy as np

    np.random.seed(123)

    # two discrete columns, one continuous, one dependent continuous
    binary_col = np.random.randint(2, size=1000).reshape(-1, 1)
    ternary_col = np.random.randint(3, size=1000).reshape(-1, 1)
    cont_col = np.r_[np.random.normal(10, 5, (300, 1)),
                     np.random.normal(20, 10, (700, 1))]
    target_col = 5 * binary_col + 3 * ternary_col + cont_col
    train_data = np.c_[binary_col, ternary_col, cont_col, target_col]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    meta = [MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL]
    ds_context = Context(meta_types=meta).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(mspn))
示例#29
0
def learn_whittle_spn_2d(train_data, n_RV, n_min_slice, init_scope=None):
    """Learn a 2-D Whittle SPN with pairwise MultivariateGaussian leaves,
    pickle it to disk, and return it.

    :param train_data: 2-D numpy array of training instances
    :param n_RV: number of random variables (columns) in train_data
    :param n_min_slice: minimum instances per slice during learning
    :param init_scope: optional initial scope passed to learn_parametric
    :return: the learned WSPN root node

    NOTE(review): relies on a module-level ``args`` object for the
    threshold and save paths — confirm at module scope.
    """
    from spn.structure.leaves.parametric.Parametric import MultivariateGaussian

    # learn spn
    ds_context = Context(parametric_types=[MultivariateGaussian] *
                         n_RV).add_domains(train_data)

    print('learning WSPN')
    # need to pair RVs
    # need flag for 2d?
    l_rfft = get_l_rfft(args)
    # l_rfft!=None --> 2d/pair gaussian node, is_2d=True --> pairwise gaussian, full covariance matrix
    wspn = learn_parametric(train_data,
                            ds_context,
                            min_instances_slice=n_min_slice,
                            threshold=args.threshold,
                            initial_scope=init_scope,
                            cpus=1,
                            l_rfft=l_rfft,
                            is_2d=True)
    save_path = get_save_path(args)
    check_path(save_path)
    # context manager closes the handle even if pickle.dump raises
    # (the original open/close pair leaked the handle on error)
    with open(save_path + 'wspn_2d.pkl', 'wb') as f:
        pickle.dump(wspn, f)

    return wspn
示例#30
0
    def test_learn(self):
        """Smoke test: learn a classifier-style SPN on Iris with
        multivariate Gaussian feature leaves and a Categorical class
        leaf; only checks that learning completes without error."""
        from sklearn.datasets import load_iris

        iris = load_iris()
        X = iris.data
        y = iris.target.reshape(-1, 1)

        # class label appended as the last column
        train_data = np.hstack((X, y))

        from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
        from spn.structure.leaves.parametric.Parametric import Categorical, MultivariateGaussian
        from spn.structure.Base import Context

        spn_classification = learn_parametric(
            train_data,
            Context(
                parametric_types=[
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    Categorical,
                ]
            ).add_domains(train_data),
            multivariate_leaf=True,
        )