Example #1
def test_linked_naive_factorization():
    spn = SpnFactory.linked_naive_factorization(vars)
    print('Naive factorization (indicators)')
    print(spn)

    spn = SpnFactory.linked_naive_factorization(vars, naive_freqs)
    print('Naive factorization (smoothing)')
    print(spn)
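The factory tests above (and several below) rely on module-level fixtures such as `vars`, `naive_freqs`, `freqs`, `I` and `II` that are defined elsewhere in the test module. A minimal sketch of what the first two could look like, assuming `vars` lists the number of values of each categorical variable and `naive_freqs` holds one vector of observed value counts per variable (the actual fixture values in the test suite may differ):

# Hypothetical fixtures, for illustration only; the real test module defines its own.
vars = [2, 2, 3, 4]           # number of values for each categorical variable
naive_freqs = [[1, 2],        # observed value counts for var 0
               [2, 2],        # var 1
               [1, 0, 2],     # var 2
               [1, 2, 1, 2]]  # var 3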
Example #2
def test_theano_naive_factorization():
    spn = SpnFactory.theano_naive_factorization(vars)
    print('Naive factorization (indicators)')
    print(spn)

    spn = SpnFactory.theano_naive_factorization(vars,
                                                naive_freqs)
    print('Naive factorization (smoothing)')
    print(spn)
Example #3
def atest_theano_kernel_density_estimation():
    num_instances = 5
    spn = SpnFactory.theano_kernel_density_estimation(num_instances, vars)
    print('Kernel density estimation (indicators)')
    print(spn)
    spn = SpnFactory.theano_kernel_density_estimation(num_instances,
                                                      vars,
                                                      sparse=True)
    print('Sparse kernel density estimation')
    print(spn)

    spn = SpnFactory.theano_kernel_density_estimation(num_instances, vars,
                                                      freqs)
    print('Kernel density estimation (smoothing)')
    print(spn)
Example #4
def atest_theano_nltcs_kernel_spn():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.theano_kernel_density_estimation(
        n_instances,
        features,
        batch_size=n_test_instances,
        sparse=True)
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    print(avg_ll)
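The NLTCS tests assume a `dataset` helper module. Judging only from how its results are used here (`train.shape[0]` for instance counts, and `features` passed where a list of per-variable value counts is expected), `data_2_freqs` plausibly returns per-feature value counts plus the feature sizes. A rough numpy sketch of that idea, under those assumptions and with a hypothetical function name:

import numpy


def data_2_freqs_sketch(data):
    # Hypothetical stand-in for dataset.data_2_freqs: for every categorical
    # feature, count how often each value occurs and record how many values it has.
    freqs = []
    features = []
    for j in range(data.shape[1]):
        n_values = int(data[:, j].max()) + 1
        counts = numpy.bincount(data[:, j].astype(int), minlength=n_values)
        freqs.append(counts.tolist())
        features.append(n_values)
    return freqs, features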
Example #5
def test_linked_kernel_density_estimation():
    num_instances = 5
    spn = SpnFactory.linked_kernel_density_estimation(num_instances,
                                                      vars)
    print('Kernel density estimation')
    print(spn)
    print(spn.stats())
Example #6
def test_random_spn_em():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 2
    n_max_children = 4
    n_scope_children = 5
    max_scope_split = 3
    merge_prob = 0.5
    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features, n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split, merge_prob)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    spn.fit_em(train, valid, test, hard=False, n_epochs=10)
Example #7
def test_linked_nltcs_kernel_spn_perf():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('ninst', n_instances, 'feats', features)
    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)
    print(spn.stats())
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    eval_start_t = perf_counter()
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    eval_end_t = perf_counter()
    print('AVG LL {0} in {1} secs'.format(avg_ll, eval_end_t - eval_start_t))
Example #8
def test_linked_random_spn_top_down():
    # small parameter values
    n_levels = 10
    vars = [2, 3, 2, 2, 4]
    n_max_children = 2
    n_scope_children = 3
    max_scope_split = 2
    merge_prob = 0.5

    # building it
    print('creating random spn')
    rand_gen = random.Random(789)

    #
    # doing this for more than once
    n_times = 10
    for _i in range(n_times):
        spn = SpnFactory.linked_random_spn_top_down(vars,
                                                    n_levels,
                                                    n_max_children,
                                                    n_scope_children,
                                                    max_scope_split,
                                                    merge_prob,
                                                    rand_gen=rand_gen)

        # printing for comparison
        print(spn)
        print(spn.stats())
        assert spn.is_valid()

        # translating to theano representation
        theano_spn = SpnFactory.linked_to_theano(spn)

        print(theano_spn)
        print(theano_spn.stats())

        #
        # looking for the same computations
        # time for some inference comparison
        for instance in II:
            print('linked')
            res_l = spn.eval(instance)
            print(res_l)
            print('theano')
            res_t = theano_spn.eval(instance)
            print(res_t)
            assert_array_almost_equal(res_l, res_t)
Example #9
def test_theano_kernel_density_estimation_categorical():
    num_instances = 5
    spn = SpnFactory.theano_kernel_density_estimation(num_instances,
                                                      vars,
                                                      node_dict=freqs,
                                                      alpha=0.1)
    print('Sparse kernel density estimation ' +
          'with smoothed categorical input layer')
    print(spn)
    print(spn.stats())
Example #10
def atest_nltcs_em_fit():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)
    print('EM training')

    spn.fit_em(train, valid, test, hard=True, epochs=2)
Example #11
def atest_theano_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.theano_naive_factorization(features,
                                                freqs,
                                                alpha=0,
                                                batch_size=n_test_instances)
    print('Evaluating on test')
    ll = spn.eval(test.T)
    avg_ll = ll.mean()
    print(avg_ll)
Example #12
def aatest_linked_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.linked_naive_factorization(features, freqs, alpha=0)
    print('Evaluating on test')
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    avg_ll = numpy.mean(lls)
    print(avg_ll)
Example #13
def test_random_spn_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 1
    n_max_children = 2000
    n_scope_children = 2000
    max_scope_split = -1
    merge_prob = 0.5
    seed = 1337
    rand_gen = random.Random(seed)

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob,
                                                rand_gen=rand_gen)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    np_rand_gen = numpy.random.RandomState(seed)

    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.2,
                n_epochs=10,
                batch_size=1,
                grad_method=1,
                validation_frequency=100,
                rand_gen=np_rand_gen,
                hard=False)
Example #14
def test_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)

    print('Created SPN with\n' + spn.stats())

    print('Starting SGD')
    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.1,
                n_epochs=20,
                batch_size=1,
                hard=False)
Example #15
def test_linked_to_theano_categorical():
    vars = [2, 2, 3, 4]
    freqs = [{
        'var': 0,
        'freqs': [1, 2]
    }, {
        'var': 1,
        'freqs': [2, 2]
    }, {
        'var': 0,
        'freqs': [3, 2]
    }, {
        'var': 1,
        'freqs': [0, 3]
    }, {
        'var': 2,
        'freqs': [1, 0, 2]
    }, {
        'var': 3,
        'freqs': [1, 2, 1, 2]
    }, {
        'var': 3,
        'freqs': [3, 4, 0, 1]
    }]

    # create input layer first
    input_layer = CategoricalSmoothedLayer(vars=vars, node_dicts=freqs)
    # get nodes
    ind_nodes = [node for node in input_layer.nodes()]

    root_node = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()

    sum3 = SumNode()
    sum4 = SumNode()

    # linking
    root_node.add_child(sum1)
    root_node.add_child(sum2)
    root_node.add_child(ind_nodes[0])
    root_node.add_child(ind_nodes[1])

    sum1.add_child(ind_nodes[2], 0.4)
    sum1.add_child(ind_nodes[3], 0.6)
    sum2.add_child(ind_nodes[3], 0.2)
    sum2.add_child(prod1, 0.5)
    sum2.add_child(prod2, 0.3)

    prod1.add_child(ind_nodes[4])
    prod1.add_child(sum3)
    prod1.add_child(sum4)
    prod2.add_child(sum3)
    prod2.add_child(sum4)

    sum3.add_child(ind_nodes[5], 0.5)
    sum3.add_child(ind_nodes[6], 0.5)
    sum4.add_child(ind_nodes[5], 0.4)
    sum4.add_child(ind_nodes[6], 0.6)

    # creating layers
    root_layer = ProductLayerLinked([root_node])
    sum_layer = SumLayerLinked([sum1, sum2])
    prod_layer = ProductLayerLinked([prod1, prod2])
    sum_layer2 = SumLayerLinked([sum3, sum4])

    # create the linked spn
    spn_linked = SpnLinked(
        input_layer=input_layer,
        layers=[sum_layer2, prod_layer, sum_layer, root_layer])

    print(spn_linked)

    # converting to theano repr
    spn_theano = SpnFactory.linked_to_theano(spn_linked)
    print(spn_theano)

    # time for some inference comparison
    for instance in I:
        print('linked')
        res_l = spn_linked.eval(instance)
        print(res_l)
        print('theano')
        res_t = spn_theano.eval(instance)
        print(res_t)
        assert_array_almost_equal(res_l, res_t)
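Here `I` is another module-level fixture: an array of test instances consistent with `vars = [2, 2, 3, 4]`, one column per variable with values inside that variable's range. A minimal sketch (the concrete values are an assumption):

import numpy

# Hypothetical test instances for vars = [2, 2, 3, 4]; any assignment within
# each variable's value range would serve the linked-vs-theano comparison above.
I = numpy.array([[0, 1, 2, 3],
                 [1, 0, 0, 0],
                 [0, 0, 1, 2]])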
Example #16
def test_linked_to_theano_indicator():
    # creating single nodes
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    # building layers from nodes
    root_layer = SumLayerLinked([root])
    prod_layer = ProductLayerLinked([prod1, prod2, prod3])
    sum_layer = SumLayerLinked([sum1, sum2, sum3, sum4])
    aprod_layer = ProductLayerLinked([prod4, prod5, prod6, prod7])
    ind_layer = CategoricalIndicatorLayer(nodes=[
        ind1, ind2, ind3, ind4, ind5, ind6, ind7, ind8, ind9, ind10, ind11
    ])

    # creating the linked spn
    spn_linked = SpnLinked(
        input_layer=ind_layer,
        layers=[aprod_layer, sum_layer, prod_layer, root_layer])

    print(spn_linked)

    # converting to theano repr
    spn_theano = SpnFactory.linked_to_theano(spn_linked)
    print(spn_theano)

    # time for some inference comparison
    for instance in I:
        print('linked')
        res_l = spn_linked.eval(instance)
        print(res_l)
        print('theano')
        res_t = spn_theano.eval(instance)
        print(res_t)
        assert_array_almost_equal(res_l, res_t)
Example #17
    def fit_structure(self, data, feature_sizes):
        """
        data is a numpy array of size {n_instances X n_features}
        feature_sizes is an array of integers representing feature ranges
        """

        #
        # resetting the data slice ids (just in case)
        DataSlice.reset_id_counter()

        tot_n_instances = data.shape[0]
        tot_n_features = data.shape[1]

        logging.info('Learning SPN structure on a (%d X %d) dataset',
                     tot_n_instances, tot_n_features)
        learn_start_t = perf_counter()

        #
        # a queue containing the data slices to process
        slices_to_process = deque()

        # a stack for building nodes
        building_stack = deque()

        # a dict to keep track of id->nodes
        node_id_assoc = {}

        # creating the first slice
        whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features)
        slices_to_process.append(whole_slice)

        first_run = True

        #
        # iteratively process & split slices
        #
        while slices_to_process:

            # process a slice
            current_slice = slices_to_process.popleft()

            # pointers to the current data slice
            current_instances = current_slice.instance_ids
            current_features = current_slice.feature_ids
            current_id = current_slice.id

            n_instances = len(current_instances)
            n_features = len(current_features)

            logging.info('\n*** Processing slice %d (%d X %d)', current_id,
                         n_instances, n_features)
            logging.debug('\tinstances:%s\n\tfeatures:%s', current_instances,
                          current_features)

            #
            # is this a leaf node or we can split?
            if n_features == 1:
                logging.info('---> Adding a leaf (just one feature)')

                (feature_id, ) = current_features
                feature_size = feature_sizes[feature_id]

                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                # create the node
                leaf_node = CategoricalSmoothedNode(
                    var=feature_id,
                    var_values=feature_size,
                    data=current_slice_data,
                    instances=current_instances,
                    alpha=self._alpha)
                # print('lnvf', leaf_node._var_freqs)
                # storing links
                # input_nodes.append(leaf_node)
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

                logging.debug('\tCreated Smooth Node %s', leaf_node)

            elif (n_instances <= self._min_instances_slice and n_features > 1):
                #
                # splitting the slice on each feature
                logging.info('---> Few instances (%d), decompose all features',
                             n_instances)
                #
                # shall put a cltree or
                if self._cltree_leaves:
                    logging.info('into a Chow-Liu tree')
                    #
                    # slicing data
                    slice_data_rows = data[current_instances, :]
                    current_slice_data = slice_data_rows[:, current_features]

                    current_feature_sizes = [
                        feature_sizes[i] for i in current_features
                    ]
                    #
                    # creating a Chow-Liu tree as leaf
                    leaf_node = CLTreeNode(vars=current_features,
                                           var_values=current_feature_sizes,
                                           data=current_slice_data,
                                           alpha=self._alpha)
                    #
                    # storing links
                    leaf_node.id = current_id
                    node_id_assoc[current_id] = leaf_node

                    logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node)

                elif self._kde and n_instances > 1:
                    estimate_kernel_density_spn(current_slice, feature_sizes,
                                                data, self._alpha,
                                                node_id_assoc, building_stack,
                                                slices_to_process)

                # elif n_instances == 1:  # FIXME: there is a bug here
                else:
                    current_slice, slices_to_process, building_stack, node_id_assoc = \
                        self.make_naive_factorization(current_slice,
                                                      slices_to_process,
                                                      building_stack,
                                                      node_id_assoc)
            else:

                #
                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                split_on_features = False
                #
                # first run is a split on rows
                if first_run:
                    logging.info('-- FIRST RUN --')
                    first_run = False
                else:
                    #
                    # try clustering on cols
                    # logging.debug('...trying to split on columns')
                    split_start_t = perf_counter()
                    print(data.shape)
                    dependent_features, other_features = greedy_feature_split(
                        data, current_slice, feature_sizes, self._g_factor,
                        self._rand_gen)
                    split_end_t = perf_counter()
                    logging.info('...tried to split on columns in {}'.format(
                        split_end_t - split_start_t))
                    if len(other_features) > 0:
                        split_on_features = True
                #
                # have dependent components been found?
                if split_on_features:
                    #
                    # splitting on columns
                    logging.info(
                        '---> Splitting on features' +
                        ' {} -> ({}, {})'.format(len(current_features),
                                                 len(dependent_features),
                                                 len(other_features)))

                    #
                    # creating two new data slices and putting them on queue
                    first_slice = DataSlice(current_instances,
                                            dependent_features)
                    second_slice = DataSlice(current_instances, other_features)
                    slices_to_process.append(first_slice)
                    slices_to_process.append(second_slice)

                    children_ids = [first_slice.id, second_slice.id]

                    #
                    # storing link parent children
                    current_slice.type = ProductNode
                    building_stack.append(current_slice)
                    current_slice.add_child(first_slice)
                    current_slice.add_child(second_slice)

                    #
                    # creating product node
                    prod_node = ProductNode(
                        var_scope=frozenset(current_features))
                    prod_node.id = current_id
                    node_id_assoc[current_id] = prod_node
                    logging.debug('\tCreated Prod Node %s (with children %s)',
                                  prod_node, children_ids)

                else:
                    #
                    # clustering on rows
                    logging.info('---> Splitting on rows')

                    #
                    # at most n_rows clusters, for sklearn
                    k_row_clusters = min(self._n_cluster_splits,
                                         n_instances - 1)

                    clustering = cluster_rows(
                        data,
                        current_slice,
                        n_clusters=k_row_clusters,
                        cluster_method=self._row_cluster_method,
                        n_iters=self._n_iters,
                        n_restarts=self._n_restarts,
                        cluster_penalty=self._cluster_penalty,
                        rand_gen=self._rand_gen,
                        sklearn_args=self._sklearn_args)

                    if len(clustering) < 2:
                        logging.info('\n\n\nLess than 2 clusters\n\n (%d)',
                                     len(clustering))

                        logging.info('forcing a naive factorization')
                        current_slice, slices_to_process, building_stack, node_id_assoc = \
                            self.make_naive_factorization(current_slice,
                                                          slices_to_process,
                                                          building_stack,
                                                          node_id_assoc)

                    else:
                        # logging.debug('obtained clustering %s', clustering)
                        logging.info('clustered into %d parts (min %d)',
                                     len(clustering), k_row_clusters)
                        # splitting
                        cluster_slices = [
                            DataSlice(cluster, current_features)
                            for cluster in clustering
                        ]
                        cluster_slices_ids = [
                            slice.id for slice in cluster_slices
                        ]

                        # cluster_prior = 5.0
                        # cluster_weights = [(slice.n_instances() + cluster_prior) /
                        #                    (n_instances + cluster_prior * len(cluster_slices))
                        #                    for slice in cluster_slices]
                        cluster_weights = [
                            slice.n_instances() / n_instances
                            for slice in cluster_slices
                        ]

                        #
                        # appending for processing
                        slices_to_process.extend(cluster_slices)

                        #
                        # storing links
                        # current_slice.children = cluster_slices_ids
                        # current_slice.weights = cluster_weights
                        current_slice.type = SumNode
                        building_stack.append(current_slice)
                        for child_slice, child_weight in zip(
                                cluster_slices, cluster_weights):
                            current_slice.add_child(child_slice, child_weight)

                        #
                        # building a sum node
                        SCOPES_DICT[frozenset(current_features)] += 1
                        sum_node = SumNode(
                            var_scope=frozenset(current_features))
                        sum_node.id = current_id
                        node_id_assoc[current_id] = sum_node
                        logging.debug(
                            '\tCreated Sum Node %s (with children %s)',
                            sum_node, cluster_slices_ids)

        learn_end_t = perf_counter()

        logging.info('\n\n\tStructure learned in %f secs',
                     (learn_end_t - learn_start_t))

        #
        # linking the spn graph (parent -> children)
        #
        logging.info('===> Building tree')

        link_start_t = perf_counter()
        root_build_node = building_stack[0]
        root_node = node_id_assoc[root_build_node.id]
        logging.debug('root node: %s', root_node)

        root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                      building_stack)
        link_end_t = perf_counter()
        logging.info('\tLinked the spn in %f secs (root_node %s)',
                     (link_end_t - link_start_t), root_node)

        #
        # building layers
        #
        logging.info('===> Layering spn')
        layer_start_t = perf_counter()
        spn = SpnFactory.layered_linked_spn(root_node)
        layer_end_t = perf_counter()
        logging.info('\tLayered the spn in %f secs',
                     (layer_end_t - layer_start_t))

        logging.info('\nLearned SPN\n\n%s', spn.stats())
        #logging.info('%s', SCOPES_DICT.most_common(30))

        return spn
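A minimal usage sketch for `fit_structure`, assuming it belongs to a LearnSPN-style learner whose constructor sets the hyperparameters the method reads from `self` (`_min_instances_slice`, `_alpha`, `_g_factor`, the clustering options, and so on). The class name, import path and constructor signature below are assumptions for illustration, not the library's confirmed API:

import numpy

from algo.learnspn import LearnSPN  # assumed import path

# data: n_instances x n_features matrix of categorical values
data = numpy.array([[1, 0, 1, 0],
                    [0, 1, 1, 0],
                    [1, 1, 0, 1],
                    [0, 0, 0, 1]])
feature_sizes = [2, 2, 2, 2]  # every feature is binary here

# constructor parameter names are assumed, mirroring the attributes used above
learner = LearnSPN(min_instances_slice=2, alpha=0.1, g_factor=5)
spn = learner.fit_structure(data, feature_sizes)
print(spn.stats())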
Example #18
def test_categorical_to_indicator_input_layer():
    #
    # creating all the data slices
    # the slicing is a fake stub
    # rows = 5
    # cols = 5
    var_1 = 0
    values_1 = 2
    var_2 = 1
    values_2 = 3
    var_3 = 2
    values_3 = 4

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var_1, values_1)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    leaf_15 = CategoricalSmoothedNode(var_2, values_2)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var_3, values_3)
    leaf_13.id = 13

    leaf_14 = CategoricalSmoothedNode(var_1, values_1)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    node_9 = ProductNode()
    node_9.id = 9

    leaf_16 = CategoricalSmoothedNode(var_2, values_2)
    leaf_16.id = 16

    leaf_17 = CategoricalSmoothedNode(var_3, values_3)
    leaf_17.id = 17

    node_9.add_child(leaf_16)
    node_9.add_child(leaf_17)

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var_2, values_2)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var_2, values_2)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(node_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var_1, values_1)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var_3, values_3)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CategoricalSmoothedNode(var_1, values_1)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var_3, values_3)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12

    #
    # changing input layer
    spn = linked_categorical_input_to_indicators(spn)

    print('Changed input layer to indicator variables')
    print(spn)
Example #19
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var,
                                     values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var,
                                      values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars,
                        var_values=var_values,
                        data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var,
                                      values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var,
                                      values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var,
                                      values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var,
                                      values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var,
                                      values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
Example #20
def test_layered_linked_spn():
    # creating single nodes
    # this code is replicated TODO: make a function
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    spn = SpnFactory.layered_linked_spn(root)

    print(spn)
    print(spn.stats())
Example #21
def test_build_theanok_spn_from_block_linked_top_rand():

    data = numpy.array([[1, 1, 0, 1, 0], [0, 1, 1, 1, 1], [1, 0, 0, 0, 0],
                        [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], [1, 0, 0, 0, 0],
                        [1, 0, 1, 1, 1], [0, 0, 0, 0, 1]])

    # small parameter values
    n_levels = 10
    # n_levels = 3
    vars = [2, 2, 2, 2, 2]
    n_max_children = 2
    n_scope_children = 3
    max_scope_split = 2
    merge_prob = 0.5

    ind_data = dataset.one_hot_encoding(data, feature_values=vars)
    # log_ind_data = numpy.clip(numpy.log(ind_data), LOG_ZERO, 0)

    # building it
    print('creating random spn')
    rand_gen = random.Random(789)

    #
    # doing this for more than once
    n_times = 10
    for i in range(n_times):

        print('\n\n******* Trial {}/{} *******\n'.format(i + 1, n_times))
        spn = SpnFactory.linked_random_spn_top_down(vars,
                                                    n_levels,
                                                    n_max_children,
                                                    n_scope_children,
                                                    max_scope_split,
                                                    merge_prob,
                                                    rand_gen=rand_gen)

        # printing for comparison
        print(spn)
        print(spn.stats())
        assert spn.is_valid()

        max_nodes = 2
        # # translating to theanok representation
        theano_spn = build_theanok_spn_from_block_linked_top(
            spn, ind_data.shape[1], vars, max_n_edges_layer=max_nodes)
        # for l in theano_spn.layers:
        #     print(l.id)
        #     l.build()
        #     l.compile()

        print(theano_spn)

        res = spn.eval(data.T)
        print('Linked Spn res', res)
        # log_data = numpy.clip(numpy.log(ind_data), LOG_ZERO, 0)
        # t_res = theano_spn.evaluate(log_data)

        t_res = theano_spn.evaluate(ind_data)
        print('Theano Spn res', t_res)

        assert_array_almost_equal(numpy.array(res), numpy.array(t_res))

        #
        # evaluate batch
        batch_preds = evaluate_on_dataset_batch(theano_spn, ind_data)
        print('Theano batch res', batch_preds)
        assert_array_almost_equal(
            numpy.array(res).flatten(), numpy.array(batch_preds))

        batch_size = 3
        minibatch_preds = evaluate_on_dataset_batch(theano_spn, ind_data,
                                                    batch_size)
        print('Theano mini batch res', minibatch_preds)
        assert_array_almost_equal(
            numpy.array(res).flatten(), numpy.array(minibatch_preds))
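`dataset.one_hot_encoding` expands each categorical column into one 0/1 indicator column per value, which is what `ind_data` feeds to the theano SPN above. A small self-contained numpy sketch of that transformation, written here as an assumed reimplementation rather than the library's own code:

import numpy


def one_hot_encoding_sketch(data, feature_values):
    # For feature j with feature_values[j] possible values, emit that many
    # indicator columns; row i gets a 1 in the column of its observed value.
    n_instances = data.shape[0]
    blocks = []
    for j, n_values in enumerate(feature_values):
        block = numpy.zeros((n_instances, n_values))
        block[numpy.arange(n_instances), data[:, j].astype(int)] = 1
        blocks.append(block)
    return numpy.concatenate(blocks, axis=1)

# e.g. one_hot_encoding_sketch(numpy.array([[1, 0]]), [2, 2]) -> [[0., 1., 1., 0.]]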
Example #22
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var,
                                     values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var,
                                      values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var,
                                      values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var,
                                      values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var,
                                      values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var,
                                      values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var,
                                      values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var,
                                      values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var,
                                      values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var,
                                      values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var,
                                      values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var,
                                      values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc,
                                                  building_stack)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
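
The slice-to-SPN pattern exercised above can be written much more compactly. The sketch below is not part of the original test suite: it reuses only the classes and factory calls that appear in these examples (DataSlice, SumNode, ProductNode, CategoricalSmoothedNode, SpnFactory), assumes they are imported as in the surrounding code, and uses made-up sizes and weights; the helper functions are only for brevity.

from collections import deque

# toy dimensions; any values work since the slicing here is a stub
rows, cols = 4, 2
var, values = 1, 2

node_assoc = {}
building_stack = deque()


def make_internal(node_type):
    # internal slices go on the building stack and get a node with the same id
    data_slice = DataSlice.whole_slice(rows, cols)
    data_slice.type = node_type
    node = node_type()
    node.id = data_slice.id
    node_assoc[node.id] = node
    building_stack.append(data_slice)
    return data_slice


def make_leaf():
    # leaf slices are registered in node_assoc but never pushed on the stack
    data_slice = DataSlice.whole_slice(rows, cols)
    leaf = CategoricalSmoothedNode(var, values)
    leaf.id = data_slice.id
    node_assoc[leaf.id] = leaf
    return data_slice


root = make_internal(SumNode)
prod_a = make_internal(ProductNode)
prod_b = make_internal(ProductNode)
root.add_child(prod_a, 0.5)
root.add_child(prod_b, 0.5)

for prod in (prod_a, prod_b):
    prod.add_child(make_leaf())
    prod.add_child(make_leaf())

root_node = SpnFactory.pruned_spn_from_slices(node_assoc, building_stack)
spn = SpnFactory.layered_linked_spn(root_node)
print(spn)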
Example #34
0
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var, values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var, values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var, values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var, values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var, values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var, values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var, values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var, values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var, values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var, values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var, values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc, building_stack)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
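
Once the layered SPN has been built, the same calls used in the assertions above can also be used to inspect it. A short hedged sketch, continuing from the `spn` built in this test; `spn.eval` on a single instance vector is an assumed query API and may differ in the actual library.

import numpy

print(spn.stats())

for i, layer in enumerate(spn.top_down_layers()):
    print('layer', i, '->', layer.n_nodes(), 'nodes')

# assumed query API: log-likelihood of one instance over `cols` binary variables
instance = numpy.zeros(cols, dtype=int)
print('log-likelihood:', spn.eval(instance))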
Example #35
0
    def fit_structure(self, data):

        #
        # a queue containing the data slices to process
        slices_to_process = deque()

        # a stack for building nodes
        building_stack = deque()

        # a dict to keep track of id->nodes
        node_id_assoc = {}

        # creating the first slice
        whole_slice = DataSlice.whole_slice(data.shape[0], data.shape[1])
        slices_to_process.append(whole_slice)

        cluster_first = self._cluster_first

        #
        # iteratively process & split slices
        #
        while slices_to_process:

            # process a slice
            current_slice = slices_to_process.popleft()

            # pointers to the current data slice
            current_instances = current_slice.instance_ids
            current_features = current_slice.feature_ids
            current_id = current_slice.id

            n_features = len(current_features)

            # (disabled) optional heuristic: drop all-zero rows from the slice
            # if n_features > 1:
            #     datarowsIdx = numpy.sum(data[current_instances, :][:, current_features], 1) > 0
            #     if not any(datarowsIdx):
            #         datarowsIdx[0] = True
            #     current_instances = current_slice.instance_ids[datarowsIdx]

            n_instances = len(current_instances)

            # (disabled) fallback when the zero-row heuristic cuts too much
            # if n_instances == 0:
            #     current_instances = [current_slice.instance_ids[0]]
            #     n_instances = len(current_instances)

            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            # is this a leaf node, or can we split further?
            if n_features == 1 and (current_slice.doNotCluster or
                                    n_instances <= self._min_instances_slice):

                (feature_id, ) = current_features

                if self.family == "poisson":
                    leaf_node = PoissonNode(data, current_instances,
                                            current_features)
                elif self.family == "gaussian":
                    leaf_node = GaussianNode(data, current_instances,
                                             current_features)
                else:
                    raise ValueError(
                        'unknown leaf family: {0}'.format(self.family))

                # storing links
                # input_nodes.append(leaf_node)
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

            # elif (current_slice_data.shape[0] < self._min_instances_slice):
            # elif ( (n_instances <= self._min_instances_slice and n_features > 1) and current_slice_data.shape[0]  < self._min_instances_slice):
            # elif ((n_instances <= self._min_instances_slice and n_features > 1)):
            elif n_features > 1 and (current_slice.doNotCluster or
                                     n_instances <= self._min_instances_slice):

                # print('into naive factorization')
                child_slices = [
                    DataSlice(current_instances, [feature_id])
                    for feature_id in current_features
                ]
                slices_to_process.extend(child_slices)

                #children_ids = [child.id for child in child_slices]

                for child_slice in child_slices:
                    child_slice.doNotCluster = current_slice.doNotCluster
                    current_slice.add_child(child_slice)
                current_slice.type = ProductNode
                building_stack.append(current_slice)

                prod_node = ProductNode(data, current_instances,
                                        current_features)
                prod_node.id = current_id

                node_id_assoc[current_id] = prod_node

            else:

                split_on_features = False
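
                #
                # general case: first try to split the features into
                # (approximately) independent groups; if no split is found,
                # fall back to clustering the instances into a sum node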

                # first_run = False
                #
                # first run is a split on rows
                if n_features == 1 or cluster_first:
                    cluster_first = False
                else:

                    if self._ind_test_method in ("pairwise_treeglm", "subsample"):

                        fcdata = current_slice_data

                        if self._ind_test_method == "subsample":
                            #sampled_rows = 2000
                            #sampled_rows = math.floor(current_slice_data.shape[0]*10/100)
                            sampled_rows = self._sub_sample_rows
                            if sampled_rows < current_slice_data.shape[0]:
                                fcdata = current_slice_data[
                                    numpy.random.choice(
                                        current_slice_data.shape[0],
                                        sampled_rows,
                                        replace=False)]
                            else:
                                fcdata = current_slice_data

                        # (disabled) R-based alternative:
                        # from pdn.independenceptest import getIndependentGroups
                        # feature_clusters = retrieve_clustering(
                        #     getIndependentGroups(fcdata, alpha=self._alpha, family=self.family),
                        #     current_features)
                        feature_clusters = retrieve_clustering(
                            getIndependentGroupsStabilityTest(
                                fcdata, alpha=self._alpha), current_features)
                    elif self._ind_test_method == "KMeans":

                        feature_clusters = retrieve_clustering(
                            cluster_rows(
                                (data[current_instances, :][:,
                                                            current_features]
                                 ).T,
                                n_clusters=2,
                                cluster_method=self._row_cluster_method,
                                n_iters=self._n_iters,
                                n_restarts=self._n_restarts,
                                cluster_prep_method="sqrt",
                                cluster_penalty=self._cluster_penalty,
                                rand_gen=self._rand_gen,
                                sklearn_args=self._sklearn_args),
                            current_instances)

                    else:
                        raise ValueError(
                            'unknown independence test method: {0}'.format(
                                self._ind_test_method))

                    split_on_features = len(feature_clusters) > 1

                #
                # have dependent components been found?
                if split_on_features:
                    #
                    # splitting on columns
                    # print('---> Splitting on features')
                    # print(feature_clusters)

                    slices = [
                        DataSlice(current_instances, cluster)
                        for cluster in feature_clusters
                    ]

                    slices_to_process.extend(slices)

                    current_slice.type = ProductNode
                    building_stack.append(current_slice)
                    for child_slice in slices:
                        current_slice.add_child(child_slice)

                    prod_node = ProductNode(data, current_instances,
                                            current_features)
                    prod_node.id = current_id
                    node_id_assoc[current_id] = prod_node

                else:
                    # print('---> Splitting on rows')

                    k_row_clusters = min(self._n_cluster_splits,
                                         n_instances - 1)

                    if n_features == 1:
                        # single feature: one k-means run with K large enough
                        # that each cluster has roughly min_instances_slice rows
                        k_row_clusters = math.floor(
                            n_instances / self._min_instances_slice) + 1
                        k_row_clusters = min(k_row_clusters, n_instances - 1)

                    clustering = retrieve_clustering(
                        cluster_rows(
                            data[current_instances, :][:, current_features],
                            n_clusters=k_row_clusters,
                            cluster_method=self._row_cluster_method,
                            n_iters=self._n_iters,
                            n_restarts=self._n_restarts,
                            cluster_prep_method=self._cluster_prep_method,
                            cluster_penalty=self._cluster_penalty,
                            rand_gen=self._rand_gen,
                            sklearn_args=self._sklearn_args),
                        current_instances)

                    cluster_slices = [
                        DataSlice(cluster, current_features)
                        for cluster in clustering
                    ]

                    if len(clustering) < k_row_clusters:
                        for cluster_slice in cluster_slices:
                            cluster_slice.doNotCluster = True

                    n_instances_clusters = sum(
                        [len(cluster) for cluster in clustering])
                    cluster_weights = [
                        len(cluster) / n_instances_clusters
                        for cluster in clustering
                    ]

                    slices_to_process.extend(cluster_slices)

                    current_slice.type = SumNode
                    building_stack.append(current_slice)
                    for child_slice, child_weight in zip(
                            cluster_slices, cluster_weights):
                        current_slice.add_child(child_slice, child_weight)

                    sum_node = SumNode(data, current_instances,
                                       current_features)
                    sum_node.id = current_id
                    node_id_assoc[current_id] = sum_node

        root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                      building_stack, True)

        spn = SpnFactory.layered_linked_spn(root_node, data, self.config)

        return spn
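
The class that owns fit_structure is not shown in this excerpt, so the usage sketch below is hypothetical: the name StructureLearner and its constructor keywords are guesses inferred from the self._* attributes referenced above, and the toy data is made up.

import numpy

rng = numpy.random.RandomState(1337)
# toy count data, matching family='poisson'
data = rng.poisson(lam=2.0, size=(200, 10))

learner = StructureLearner(family='poisson',            # hypothetical class / kwargs
                           min_instances_slice=50,
                           n_cluster_splits=2,
                           row_cluster_method='KMeans',
                           ind_test_method='subsample',
                           sub_sample_rows=100)
spn = learner.fit_structure(data)
print(spn.stats())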
Example #36
0
def test_layered_pruned_linked_spn_cltree():
    #
    # building the linked nodes directly (no data slices here)
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var, values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var, values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var, values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var, values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var, values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var, values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
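
A shorter hedged variant of the same construction: one sum root, two product children, each mixing a CLTreeNode with a categorical leaf. Ids and weights are arbitrary, the classes are assumed to be imported as in the example above, and the exact effect of the pruning pass depends on SpnFactory internals.

import numpy

vars = [2, 3]
var_values = [2, 2]
s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

root = SumNode()
root.id = 1

prod_a = ProductNode()
prod_a.id = 2
prod_b = ProductNode()
prod_b.id = 3
root.add_child(prod_a, 0.5)
root.add_child(prod_b, 0.5)

for first_id, prod in ((4, prod_a), (6, prod_b)):
    # one multivariate CLTree leaf and one univariate categorical leaf per product
    cltree_leaf = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    cltree_leaf.id = first_id
    cat_leaf = CategoricalSmoothedNode(1, 2)
    cat_leaf.id = first_id + 1
    prod.add_child(cltree_leaf)
    prod.add_child(cat_leaf)

root_node = SpnFactory.layered_pruned_linked_spn(root)
spn = SpnFactory.layered_linked_spn(root_node)
print(spn)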