def create_disj(data, scope, assignments, alpha):

    unq_data, counts = np.unique(data, axis=0, return_counts=True)
    probs = np.zeros(assignments.shape[0])
    for i in range(assignments.shape[0]):
        index = np.where(np.all(assignments[i] == unq_data, axis=1))[0]
        if len(index):
            probs[i] = counts[index[0]]
    probs = (probs + alpha) / (probs + alpha).sum()

    indicators = {
        var: [Bernoulli(scope=[var], p=0),
              Bernoulli(scope=[var], p=1)]
        for var in scope
    }

    prods = []
    for i in range(assignments.shape[0]):
        children = []
        for j in range(assignments.shape[1]):
            children.append(indicators[scope[j]][assignments[i, j]])
            # children.append(Bernoulli(scope=[scope[j]], p=assignments[i, j]))
        prods.append(Product(children=children))

    if len(prods) > 1:
        disj = Sum(children=prods, weights=probs)
    else:
        disj = prods[0]

    assign_ids(disj)
    rebuild_scopes_bottom_up(disj)

    return disj
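
A minimal usage sketch (hedged: the data, scope ids, and alpha below are made-up values, not from the original snippet), assuming numpy and the SPFlow names used above (Bernoulli, Product, Sum, assign_ids, rebuild_scopes_bottom_up) are already imported:

import numpy as np

data = np.array([[0, 1], [1, 1], [1, 1], [0, 1]])        # observed rows over two binary variables
scope = [3, 4]                                            # variable ids for the two columns
assignments = np.array([[0, 1], [1, 0], [1, 1]])          # assignments the disjunction should cover
disj = create_disj(data, scope, assignments, alpha=0.1)   # smoothed Sum over indicator Products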
Example #2
def create_SPN2():
    from spn.structure.Base import assign_ids
    from spn.structure.Base import rebuild_scopes_bottom_up

    from spn.algorithms.Validity import is_valid
    from spn.structure.leaves.parametric.Parametric import Categorical

    from spn.structure.Base import Sum, Product

    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])
    spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    val, msg = is_valid(spn)
    assert val, msg

    return spn
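
As a quick check (a hedged sketch; the data values are illustrative), the hand-built SPN can be evaluated with SPFlow's log_likelihood, which the later examples also use:

import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_SPN2()
test_data = np.array([[0.0, 0.0, 1.0], [1.0, 1.0, 0.0]])  # one column per scope variable 0, 1, 2
print(log_likelihood(spn, test_data))                     # per-row log-likelihoods, shape (2, 1)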
Example #3
    def test_spn_to_str_and_back(self):
        self.check_obj_and_reconstruction(
            Categorical(p=[0.1, 0.2, 0.7], scope=0))

        self.check_obj_and_reconstruction(Gaussian(mean=0, stdev=10, scope=0))
        self.check_obj_and_reconstruction(
            Gaussian(mean=1.2, stdev=1.5, scope=0))

        self.check_obj_and_reconstruction(Gaussian(mean=-1.2, stdev=1,
                                                   scope=0))

        gamma = Gamma(alpha=1, beta=2, scope=0)
        lnorm = LogNormal(mean=1, stdev=2, scope=0)

        self.check_obj_and_reconstruction(gamma)

        self.check_obj_and_reconstruction(lnorm)

        root = Sum(children=[gamma, lnorm], weights=[0.2, 0.8])
        assign_ids(root)
        rebuild_scopes_bottom_up(root)
        self.check_obj_and_reconstruction(root)

        root = (0.3 * (Gaussian(mean=0, stdev=1, scope=0) * Gaussian(mean=1, stdev=1, scope=1)) +
                0.7 * (Gaussian(mean=2, stdev=1, scope=0) * Gaussian(mean=3, stdev=1, scope=1)))

        self.check_obj_and_reconstruction(root)
Example #4
def marginalize(node, scope):
    assert isinstance(scope, set), "scope must be a set"

    def marg_recursive(node):
        node_scope = set(node.scope)

        if node_scope.issubset(scope):
            return None

        if isinstance(node, Leaf):
            if len(node.scope) > 1:
                raise Exception('Leaf Node with |scope| > 1')

            return node

        newNode = node.__class__()

        #a sum node gets copied with all its children, or gets removed completely
        if isinstance(node, Sum):
            newNode.weights.extend(node.weights)

        for i, c in enumerate(node.children):
            newChildren = marg_recursive(c)
            if newChildren is None:
                continue

            newNode.children.append(newChildren)
        return newNode

    newNode = marg_recursive(node)
    rebuild_scopes_bottom_up(newNode)
    newNode = prune(newNode)
    valid, err = is_valid(newNode)
    assert valid, err
    assign_ids(newNode)
    return newNode
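
A hedged usage sketch: in this variant, scope appears to hold the variables to marginalize out (nodes whose scope lies entirely inside it are dropped), which appears to differ from SPFlow's stock marginalize(node, keep). The toy SPN from Example #2 is assumed to be available:

spn = create_SPN2()                 # SPN over variables {0, 1, 2}
marg_spn = marginalize(spn, {2})    # drop nodes whose scope is contained in {2}
print(marg_spn.scope)               # expected to cover the remaining variables 0 and 1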
Example #5
def create_spflow_spn(n_feats):
    gaussians1 = []
    gaussians2 = []
    for i in range(n_feats):
        g1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        g2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        gaussians1.append(g1)
        gaussians2.append(g2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([gaussians1[i], gaussians1[i + 1]])
        p2 = Product([gaussians2[i], gaussians2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
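
An illustrative call (an assumption, not part of the original snippet); note the pairwise grouping of features means n_feats should be even:

np.random.seed(0)             # make the random leaf parameters reproducible
spn = create_spflow_spn(4)    # 4 features -> two 2-feature Products per mixture component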
Example #6
def create_spflow_spn(n_feats, ctype=Gaussian):
    children1 = []
    children2 = []
    for i in range(n_feats):
        if ctype == Gaussian:
            c1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
            c2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        else:
            #c1 = Bernoulli(p=1.0, scope=i)
            #c2 = Bernoulli(p=1.0, scope=i)
            c1 = Bernoulli(p=np.random.rand(), scope=i)
            c2 = Bernoulli(p=np.random.rand(), scope=i)

        children1.append(c1)
        children2.append(c2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([children1[i], children1[i + 1]])
        p2 = Product([children2[i], children2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
Example #7
    def test_sum(self):
        spn = Product()
        for s in range(7):
            spn.children.append(Leaf(scope=s))

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)

        new_spn = SPN_Reshape(spn, 2)

        print(spn)
Example #8
def create_conj(data, scope, alpha):

    conj = Product(children=[
        Bernoulli(scope=[scope[k]],
                  p=(data[0][k] * data.shape[0] + alpha) /
                  (data.shape[0] + 2 * alpha)) for k in range(len(scope))
    ])

    assign_ids(conj)
    rebuild_scopes_bottom_up(conj)

    return conj
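
A small hedged sketch with made-up data: create_conj fits a smoothed Bernoulli product concentrated on the assignment of the first data row.

data = np.array([[1, 0], [1, 0], [1, 0]])          # three identical rows over variables 5 and 6
conj = create_conj(data, scope=[5, 6], alpha=0.1)
# p for variable 5: (1*3 + 0.1) / (3 + 0.2) ~ 0.97; for variable 6: (0*3 + 0.1) / (3 + 0.2) ~ 0.03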
Example #9
    def _deserialize_model(self, model):
        rootID = model.rootNode
        featureType = model.featureType
        name = model.name
        if name == "":
            name = None
        rootNodes = self._binary_deserialize_graph(model.nodes)
        for root in rootNodes:
            rebuild_scopes_bottom_up(root)
            assert is_valid(root)[0], "SPN invalid after deserialization"

        rootNode = next((root for root in rootNodes if root.id == rootID), None)
        if rootNode is None:
            logger.error(f"Did not find serialized root node {rootID}")
        return SPNModel(rootNode, featureType, name)
Example #10
def create_naive_fact(data, scope, alpha):
    """
    It returns a naive factorization of the data.
    Laplace's correction is not needed, but if not used may cause underflow.
    """

    probs = (np.sum(data, axis=0) + alpha) / (data.shape[0] + 2 * alpha)

    naive_fact = Product(children=[
        Bernoulli(p=probs[k], scope=[scope[k]]) for k in range(len(scope))
    ])

    assign_ids(naive_fact)
    rebuild_scopes_bottom_up(naive_fact)

    return naive_fact
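
A worked sketch of the smoothed estimate (the data is made up): with 4 rows in which variable 0 is 1 three times and alpha = 1, its Bernoulli parameter becomes (3 + 1) / (4 + 2) ~ 0.67.

data = np.array([[1, 0], [1, 1], [1, 0], [0, 0]])
naive = create_naive_fact(data, scope=[0, 1], alpha=1)
# probs = (np.sum(data, axis=0) + 1) / (4 + 2) -> [4/6, 2/6] ~ [0.67, 0.33]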
Example #11
    def test_torch_vs_tf_time(self):
        # Create sample data
        from sklearn.datasets import make_blobs
        import tensorflow as tf
        from time import time

        X, y = make_blobs(n_samples=10,
                          centers=3,
                          n_features=2,
                          random_state=0)
        X = X.astype(np.float32)

        # SPFlow implementation
        g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
        g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
        g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
        g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
        p0 = Product(children=[g00, g10])
        p1 = Product(children=[g01, g11])
        s = Sum(weights=[0.2, 0.8], children=[p0, p1])
        assign_ids(s)
        rebuild_scopes_bottom_up(s)

        # Convert
        tf_spn, data_placeholder, variable_dict = spn_to_tf_graph(s, data=X)
        torch_spn = SumNode.from_spn(s)

        # Optimizer
        lr = 0.001
        tf_optim = tf.train.AdamOptimizer(lr)
        torch_optim = optim.Adam(torch_spn.parameters(), lr)

        t0 = time()
        epochs = 10
        optimize_tf_graph(tf_spn,
                          variable_dict,
                          data_placeholder,
                          X,
                          epochs=epochs,
                          optimizer=tf_optim)
        t1 = time()
        optimize_torch(torch_spn, X, epochs=epochs, optimizer=torch_optim)
        t2 = time()

        print("Tensorflow took: ", t1 - t0)
        print("PyTorch took: ", t2 - t1)
Example #12
def complete_layers(layer_nodes, current_node_type=Sum, depth=None):
    # all leaves should be at same depth
    root_layer = False
    if depth is None:
        root_layer = True
        depth = get_depth(layer_nodes[0])

    if depth == 2:
        return

    children_layer = []
    if current_node_type == Sum:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Sum)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Product):
                    n.children[j] = Product([c])
            children_layer.extend(n.children)
        children_layer_type = Product
    elif current_node_type == Product:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Product)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Sum):
                    n.children[j] = Sum([1.0], [c])
            children_layer.extend(n.children)
        children_layer_type = Sum
    else:
        raise Exception('unsupported node type: ' + str(current_node_type))

    complete_layers(children_layer,
                    current_node_type=children_layer_type,
                    depth=depth - 1)

    if root_layer:
        rebuild_scopes_bottom_up(layer_nodes[0])
        assign_ids(layer_nodes[0])
Example #13
    def __init__(self):
        p0 = Product(children=[
            Categorical(p=[0.3, 0.7], scope=1),
            Categorical(p=[0.4, 0.6], scope=2)
        ])
        p1 = Product(children=[
            Categorical(p=[0.5, 0.5], scope=1),
            Categorical(p=[0.6, 0.4], scope=2)
        ])
        s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
        p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
        p3 = Product(children=[
            Categorical(p=[0.2, 0.8], scope=0),
            Categorical(p=[0.3, 0.7], scope=1)
        ])
        p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])

        self.spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

        assign_ids(self.spn)
        rebuild_scopes_bottom_up(self.spn)
Example #14
    def test_equal_to_tf(self):
        # SPFlow implementation
        g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
        g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
        g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
        g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
        p0 = Product(children=[g00, g10])
        p1 = Product(children=[g01, g11])
        s = Sum(weights=[0.2, 0.8], children=[p0, p1])

        assign_ids(s)
        rebuild_scopes_bottom_up(s)

        # Test for 100 random samples
        data = np.random.randn(100, 2)

        # LL from SPN
        ll = log_likelihood(s, data)

        # PyTorch implementation
        g00 = GaussianNode(mean=0.0, std=1.0, scope=0)
        g10 = GaussianNode(mean=1.0, std=2.0, scope=1)
        g01 = GaussianNode(mean=3.0, std=2.0, scope=0)
        g11 = GaussianNode(mean=5.0, std=1.0, scope=1)
        p0 = ProductNode(children=[g00, g10])
        p1 = ProductNode(children=[g01, g11])
        rootnode = SumNode(weights=[0.2, 0.8], children=[p0, p1])

        datatensor = torch.Tensor(data)
        # LL from pytorch
        ll_torch = rootnode(datatensor)

        # Assert equality
        self.assertTrue(
            np.isclose(np.array(ll).squeeze(),
                       ll_torch.detach().numpy(),
                       atol=DELTA).all())
Example #15
    def _serialize_model(self, model):
        msg = spflow_capnp.Model.new_message()
        # is_valid returns (bool, message); check the flag explicitly
        assert is_valid(model.root)[0], "SPN invalid before serialization"
        # Assign (new) IDs to the nodes.
        # Keep track of already assigned IDs, so the IDs are
        # unique for the whole file.
        assign_ids(model.root, self.assignedIDs)
        # Rebuild scopes bottom-up
        rebuild_scopes_bottom_up(model.root)
        msg.rootNode = model.root.id
        msg.numFeatures = len(model.root.scope)
        msg.featureType = model.featureType
        scope = msg.init("scope", len(model.root.scope))
        for i, v in enumerate(model.root.scope):
            scope[i] = self._unwrap_value(v)
        name = ""
        if model.name is not None:
            name = model.name
        msg.name = name
        numNodes = get_number_of_nodes(model.root)
        nodes = msg.init("nodes", numNodes)
        nodeList = ListHandler(nodes)
        self._serialize_graph([model.root], nodeList)
        return msg
Example #16
def getSpn2():
    spn2 = Product(children=[Categorical(p=[0.5, 0.5], scope=0), Categorical(p=[0.2, 0.8], scope=2)])
    assign_ids(spn2)
    rebuild_scopes_bottom_up(spn2)
    return spn2
Example #17
    type_to_param_map=pm_continuous_param_map,
    scope=[0],
    init_weights=b_lf_1_init_weights)
b_lf_2_init_weights = {Gaussian: 0.3, Gamma: 0.7}
# b_lf_2_init_weights = np.array([.3, .7])
b_fat_right_leaf_2, _priors = type_mixture_leaf_factory(
    leaf_type='pm',
    leaf_meta_type=MetaType.REAL,
    type_to_param_map=pm_continuous_param_map,
    scope=[1],
    init_weights=b_lf_2_init_weights)
l_r_prod.children = [b_fat_right_leaf_1, b_fat_right_leaf_2]

#
# composing
rebuild_scopes_bottom_up(root)
assign_ids(root)
print(root)
print(spn_to_str_equation(root))

global_W = compute_global_type_weights(root)
print('GLOBAL_W', global_W)

global_W = compute_global_type_weights(root, aggr_type=True)
print('GLOBAL_W', global_W)

gw_map = compute_leaf_global_mix_weights(root)
print('G MIX W', gw_map)

part_map = compute_partition_id_map(root)
print('PARTITION MAP', part_map)
Example #18
File: SPMN.py  Project: c0derzer0/SPFlow
    def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                               curr_information_set_scope, index):

        logging.info(
            f'start of new recursion in __learn_spmn_structure method of SPMN')
        logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
        logging.debug(
            f'curr_information_set_scope: {curr_information_set_scope}')

        # rest set is remaining variables excluding the variables in current information set
        rest_set_scope = [
            var_scope for var_scope in remaining_vars_scope
            if var_scope not in curr_information_set_scope
        ]

        logging.debug(f'rest_set_scope: {rest_set_scope}')

        scope_index = sum([len(x) for x in self.params.partial_order[:index]])
        next_scope_index = sum(
            [len(x) for x in self.params.partial_order[:index + 1]])

        if remaining_vars_scope == curr_information_set_scope:
            # this is last information set in partial order. Base case of recursion

            # test if current information set is a decision node
            if self.params.partial_order[index][0] in self.params.decision_nodes:
                raise Exception(
                    f'last information set of partial order either contains random '
                    f'and utility variables or just a utility variable. '
                    f'This contains decision variable: {self.params.partial_order[index][0]}'
                )

            else:
                # contains just the random and utility variables

                logging.info(
                    f'at last information set of this recursive call: {curr_information_set_scope}'
                )
                ds_context_last_information_set = get_ds_context(
                    remaining_vars_data, remaining_vars_scope, self.params)

                if self.params.util_to_bin:

                    last_information_set_spn = learn_parametric(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

                else:

                    last_information_set_spn = learn_mspn_for_spmn(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

            logging.info(f'created spn at last information set')
            return last_information_set_spn

        # test for decision node. test if current information set is a decision node
        elif self.params.partial_order[index][0] in self.params.decision_nodes:

            decision_node = self.params.partial_order[index][0]

            logging.info(f'Encountered Decision Node: {decision_node}')

            # cluster the data from remaining variables w.r.t values of decision node
            clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(
                remaining_vars_data)

            decision_node_children_spns = []
            index += 1

            next_information_set_scope = np.array(
                range(next_scope_index, next_scope_index +
                      len(self.params.partial_order[index]))).tolist()

            next_remaining_vars_scope = rest_set_scope
            self.set_next_operation('Any')

            logging.info(f'split clusters based on decision node values')
            for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:

                decision_node_children_spns.append(
                    self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                                next_remaining_vars_scope,
                                                next_information_set_scope,
                                                index))

            decision_node_spn_branch = Max(
                dec_idx=scope_index,
                dec_values=dec_vals,
                children=decision_node_children_spns,
                feature_name=decision_node)

            assign_ids(decision_node_spn_branch)
            rebuild_scopes_bottom_up(decision_node_spn_branch)
            logging.info(f'created decision node')
            return decision_node_spn_branch

        # testing for independence
        else:

            curr_op = self.get_curr_operation()
            logging.debug(
                f'curr_op at prod node (independence test): {curr_op}')

            if curr_op != 'Sum':  # fails if correlated variable set found in previous recursive call.
                # Without this condition code keeps looping at this stage

                ds_context = get_ds_context(remaining_vars_data,
                                            remaining_vars_scope, self.params)

                split_cols = get_split_cols_RDC_py()
                data_slices_prod = split_cols(remaining_vars_data, ds_context,
                                              remaining_vars_scope)

                logging.debug(
                    f'{len(data_slices_prod)} slices found at data_slices_prod: '
                )

                prod_children = []
                next_remaining_vars_scope = []
                independent_vars_scope = []

                for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:

                    if any(var_scope in correlated_var_set_scope
                           for var_scope in rest_set_scope):

                        next_remaining_vars_scope.extend(
                            correlated_var_set_scope)

                    else:
                        # this variable set of current information set is
                        # not correlated to any variable in the rest set

                        logging.info(
                            f'independent variable set found: {correlated_var_set_scope}'
                        )

                        ds_context_prod = get_ds_context(
                            correlated_var_set_cluster,
                            correlated_var_set_scope, self.params)

                        if self.params.util_to_bin:

                            independent_var_set_prod_child = learn_parametric(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)

                        else:

                            independent_var_set_prod_child = learn_mspn_for_spmn(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)
                        independent_vars_scope.extend(correlated_var_set_scope)
                        prod_children.append(independent_var_set_prod_child)

                logging.info(
                    f'correlated variables over entire remaining variables '
                    f'at prod, passed for next recursion: '
                    f'{next_remaining_vars_scope}')

                # check if all variables in current information set are consumed
                if all(var_scope in independent_vars_scope
                       for var_scope in curr_information_set_scope):

                    index += 1
                    next_information_set_scope = np.array(
                        range(
                            next_scope_index, next_scope_index +
                            len(self.params.partial_order[index]))).tolist()

                    # since current information set is totally consumed
                    next_remaining_vars_scope = rest_set_scope

                else:
                    # some variables in current information set still remain
                    index = index

                    next_information_set_scope = set(
                        curr_information_set_scope) - set(
                            independent_vars_scope)
                    next_remaining_vars_scope = next_information_set_scope | set(
                        rest_set_scope)

                    # convert unordered sets of scope to sorted lists to keep in sync with partial order
                    next_information_set_scope = sorted(
                        list(next_information_set_scope))
                    next_remaining_vars_scope = sorted(
                        list(next_remaining_vars_scope))

                self.set_next_operation('Sum')

                next_remaining_vars_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope,
                    next_remaining_vars_scope)

                logging.info(
                    f'independence test completed for current information set {curr_information_set_scope} '
                    f'and rest set {rest_set_scope} ')

                remaining_vars_prod_child = self.__learn_spmn_structure(
                    next_remaining_vars_data, next_remaining_vars_scope,
                    next_information_set_scope, index)

                prod_children.append(remaining_vars_prod_child)

                product_node = Product(children=prod_children)
                assign_ids(product_node)
                rebuild_scopes_bottom_up(product_node)

                logging.info(f'created product node')
                return product_node

            # Cluster the data
            else:

                curr_op = self.get_curr_operation()
                logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

                split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

                if self.cluster_by_curr_information_set:

                    curr_information_set_data = column_slice_data_by_scope(
                        remaining_vars_data, remaining_vars_scope,
                        curr_information_set_scope)

                    ds_context_sum = get_ds_context(
                        curr_information_set_data, curr_information_set_scope,
                        self.params)
                    data_slices_sum, km_model = split_rows(
                        curr_information_set_data, ds_context_sum,
                        curr_information_set_scope)

                    logging.info(
                        f'split clusters based on current information set {curr_information_set_scope}'
                    )

                else:
                    # cluster on whole remaining variables
                    ds_context_sum = get_ds_context(remaining_vars_data,
                                                    remaining_vars_scope,
                                                    self.params)
                    data_slices_sum, km_model = split_rows(
                        remaining_vars_data, ds_context_sum,
                        remaining_vars_scope)

                    logging.info(
                        f'split clusters based on whole remaining variables {remaining_vars_scope}'
                    )

                sum_node_children = []
                weights = []
                index = index
                logging.debug(
                    f'{len(data_slices_sum)} clusters found at data_slices_sum'
                )

                cluster_num = 0
                labels_array = km_model.labels_
                logging.debug(
                    f'cluster labels of rows: {labels_array} used to cluster data on '
                    f'total remaining variables {remaining_vars_scope}')

                for cluster, scope, weight in data_slices_sum:

                    self.set_next_operation("Prod")

                    # cluster whole remaining variables based on clusters formed.
                    # below methods are useful if clusters were formed on just the current information set

                    cluster_indices = get_row_indices_of_cluster(
                        labels_array, cluster_num)
                    cluster_on_remaining_vars = row_slice_data_by_indices(
                        remaining_vars_data, cluster_indices)

                    # logging.debug(np.array_equal(cluster_on_remaining_vars, cluster ))

                    sum_node_children.append(
                        self.__learn_spmn_structure(
                            cluster_on_remaining_vars, remaining_vars_scope,
                            curr_information_set_scope, index))

                    weights.append(weight)

                    cluster_num += 1

                sum_node = Sum(weights=weights, children=sum_node_children)

                assign_ids(sum_node)
                rebuild_scopes_bottom_up(sum_node)
                logging.info(f'created sum node')
                return sum_node
Example #19
    #
    # RANDOM STRUCTURE LEARNING
    learn_start_t = perf_counter()
    spn = learn_rand_spn(data,
                         ds_context,
                         min_instances_slice=args.min_instances,
                         row_a=args.beta_rows[0],
                         row_b=args.beta_rows[1],
                         col_a=args.beta_cols[0],
                         col_b=args.beta_cols[1],
                         col_threshold=args.col_split_threshold,
                         memory=None,
                         rand_gen=rand_gen)

    rebuild_scopes_bottom_up(spn)
    assign_ids(spn)
    learn_end_t = perf_counter()

    stats = get_structure_stats_dict(spn)
    logging.info('\n\nLearned spn in {} with stats:\n\t{}'.format(
        learn_end_t - learn_start_t, stats))

    print(spn_to_str_equation(spn))
    print(spn.scope)

    #
    # storing the spn on file
    spn_output_path = os.path.join(out_path, 'spn.model.pkl')
    store_start_t = perf_counter()
    with open(spn_output_path, 'wb') as f:
Example #20
def learn_spmn_structure(train_data, index, scope_index, params):

    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)
        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:

            if index < len(params.partial_order):

                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)

            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn



    else:

        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)

        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)

        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)
        curr_op = get_next_operation()


        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):
            set_next_operation("Sum")

            if params.util_to_bin:

                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20, initial_scope=scope_prod)

            else:

                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod,
                                  min_instances_slice=20, initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):

                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])

                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

            else:
                spn = spn0
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        else:

            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))

            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []
            index = index

            if index < len(params.partial_order):

                for cl, scop, weight in data_slices_sum:

                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

                spn = Sum(weights=weights, children=spn0)
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn
Example #21
def build_xpc_bottom_up(data, part_root, dtree_dict, det_level, leaves, alpha):
    """
    Build the XPC induced by the partitions tree in a bottom-up way.
    The building process is based on a post-order traversal of the partitions tree.
    :param alpha: smoothing factor
    :param det_level: 0 for non det., 1 for relaxed det. and 2 for det.
    :param dtree_dict: None if no dependency tree has to be respected, a dictionary of dtree otherwise
    :param part_root: A random partitions tree
    :param data: The data to model
    :param leaves: multivariate leaf function
    :return: the XPC induced by the partition tree
    """

    partitions_stack = [part_root]
    pc_nodes_stack = []
    last_part_exp = None

    while partitions_stack:

        part = partitions_stack[-1]

        if not part.is_partitioned() or (last_part_exp
                                         in part.get_sub_partitions()):

            if part.is_partitioned():

                pc_child_nodes = pc_nodes_stack[-len(part.get_sub_partitions()
                                                     ):]
                pc_nodes_stack = pc_nodes_stack[:len(pc_nodes_stack) -
                                                len(part.get_sub_partitions())]

                if part.is_horizontally_partitioned:

                    weights = [
                        len(sub_part.row_ids) / len(part.row_ids)
                        for sub_part in part.get_sub_partitions()
                    ]
                    pc_sum_node = Sum(weights=weights, children=pc_child_nodes)
                    pc_nodes_stack.append(pc_sum_node)

                else:

                    pc_child_nodes_ = []
                    for c in pc_child_nodes:
                        if isinstance(c,
                                      Product) or (isinstance(c, Sum)
                                                   and len(c.children) == 1):
                            pc_child_nodes_.extend(c.children)
                        else:
                            pc_child_nodes_.append(c)
                    pc_prod_node = Product(children=pc_child_nodes_)
                    pc_nodes_stack.append(pc_prod_node)

            else:

                leaf_pc = create_leaf_pc(data, part, leaves, dtree_dict,
                                         det_level, alpha)
                pc_nodes_stack.append(leaf_pc)

            last_part_exp = partitions_stack.pop()
        else:
            partitions_stack.extend(part.get_sub_partitions()[::-1])

    xpc = pc_nodes_stack[0]
    assign_ids(xpc)
    rebuild_scopes_bottom_up(xpc)

    return xpc
Example #22
def create_expc(data,
                ensemble_dim,
                sd_level,
                det_level,
                min_part_inst,
                conj_len,
                arity,
                leaves,
                alpha=0.01,
                bagging=False,
                max_parts=1000,
                random_seed=42):

    if sd_level not in SD_LEVELS:
        raise StructDecError()

    if det_level not in DET_LEVELS:
        raise DetError()

    if arity < 2 or arity > 2**conj_len:
        raise ArityError()

    if sd_level == SD_LEVEL_2 and conj_len == 1:
        raise NoRandomness()

    print('Generating random partitionings..')
    np.random.seed(random_seed)

    str_dec = (sd_level == SD_LEVEL_1 or sd_level == SD_LEVEL_2)

    if sd_level == SD_LEVEL_2:
        uncond_vars = greedy_vars_ordering(data, conj_len)
    else:
        uncond_vars = np.arange(data.shape[1]).tolist()

    conj_vars_l = [None] * ensemble_dim
    cl_parts_l = [None] * ensemble_dim
    n_parts_l = [None] * ensemble_dim
    ptrees_l = [None] * ensemble_dim
    data_l = [None] * ensemble_dim
    xpc_l = [None] * ensemble_dim

    for i in range(ensemble_dim):

        if sd_level != SD_LEVEL_2:
            np.random.shuffle(uncond_vars)

        if bagging:
            data_l[i] = data[np.random.choice(a=data.shape[0],
                                              size=data.shape[0] * 70 // 100,
                                              replace=True)]
        else:
            data_l[i] = data

        print(uncond_vars)
        ptrees_l[i], cl_parts_l[i], conj_vars_l[i], n_parts_l[i] = \
            create_random_partitioning(data=data_l[i],
                                       str_dec=str_dec,
                                       min_part_inst=min_part_inst,
                                       conj_len=conj_len,
                                       arity=arity,
                                       max_parts=max_parts,
                                       uncond_vars=uncond_vars)

    if all([n_parts == 1 for n_parts in n_parts_l]):
        raise NoPartitioningFound()

    if sd_level == SD_LEVEL_0 or leaves == create_naive_fact:

        dtree_dict = None
        for i in range(ensemble_dim):
            print('Learning XPC_%s/%s' % (i, ensemble_dim))
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)

    elif sd_level == SD_LEVEL_1:

        for i in range(ensemble_dim):
            print('Learning XPC_%s/%s' % (i, ensemble_dim))
            #
            # learn a dtree for each XPC
            dtree_dict = create_dtree_dict([data_l[i]], [cl_parts_l[i]],
                                           conj_vars_l[i], alpha)
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)

    elif sd_level == SD_LEVEL_2:

        #
        # learn a dtree for the ensemble
        print('Learning a dependency tree for the ensemble..')
        dtree_dict = create_dtree_dict(data_l, cl_parts_l,
                                       max(conj_vars_l, key=len), alpha)
        for i in range(ensemble_dim):
            print('Building XPC_%s/%s' % (i, ensemble_dim))
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)

    expc = Sum(weights=1 / ensemble_dim * np.ones(ensemble_dim),
               children=xpc_l)
    assign_ids(expc)
    rebuild_scopes_bottom_up(expc)

    return expc, n_parts_l
Example #23
def str_to_spn(text, features=None, str_to_spn_lambdas=_str_to_spn):
    from lark import Lark

    ext_name = "\n".join(map(lambda s: "    | " + s,
                             str_to_spn_lambdas.keys()))

    ext_grammar = "\n".join([s for _, s, _ in str_to_spn_lambdas.values()])

    grammar = r"""
%import common.DECIMAL -> DECIMAL
%import common.WS
%ignore WS
%import common.WORD -> WORD
%import common.DIGIT -> DIGIT
ALPHANUM: "a".."z"|"A".."Z"|DIGIT
PARAMCHARS: ALPHANUM|"_"
FNAME: ALPHANUM+
PARAMNAME: PARAMCHARS+
NUMBER: DIGIT|DECIMAL
NUMBERS: NUMBER+
list: "[" [NUMBERS ("," NUMBERS)*] "]"


?node: prodnode
    | sumnode
""" + ext_name + r"""

prodnode: "(" [node ("*" node)*] ")"
sumnode: "(" [NUMBERS "*" node ("+" NUMBERS "*" node)*] ")"

""" + ext_grammar

    parser = Lark(grammar, start='node')
    # print(grammar)
    tree = parser.parse(text)

    def tree_to_spn(tree, features):
        tnode = tree.data

        if tnode == "sumnode":
            node = Sum()
            for i in range(int(len(tree.children) / 2)):
                j = 2 * i
                w, c = tree.children[j], tree.children[j + 1]
                node.weights.append(float(w))
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode == "prodnode":
            if len(tree.children) == 1:
                return tree_to_spn(tree.children[0], features)
            node = Product()
            for c in tree.children:
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode in str_to_spn_lambdas:
            return str_to_spn_lambdas[tnode][0](tree, features,
                                                str_to_spn_lambdas[tnode][2],
                                                tree_to_spn)

        raise Exception('Node type not registered: ' + tnode)

    spn = tree_to_spn(tree, features)

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)
    valid, err = is_valid(spn)
    assert valid, err
    assign_ids(spn)
    return spn