예제 #1
0
    def linked_kernel_density_estimation(cls,
                                         n_instances,
                                         features,
                                         node_dict=None,
                                         alpha=0.1
                                         # ,batch_size=1,
                                         # sparse=False
                                         ):
        """
        WRITEME
        """

        n_features = len(features)

        # the top one is a sum layer with a single node
        root_node = SumNode()
        root_layer = SumLayerLinked([root_node])

        # second one is a product layer with n_instances nodes
        product_nodes = [ProductNode() for i in range(n_instances)]
        product_layer = ProductLayerLinked(product_nodes)
        # linking them to the root node
        for prod_node in product_nodes:
            root_node.add_child(prod_node, 1. / n_instances)

        # last layer can be a categorical smoothed input
        # or sum_layer + categorical indicator input

        input_layer = None
        layers = None
        n_leaf_nodes = n_features * n_instances

        if node_dict is None:
            # creating a sum_layer with n_leaf_nodes
            sum_nodes = [SumNode() for i in range(n_leaf_nodes)]
            # store them into a layer
            sum_layer = SumLayerLinked(sum_nodes)
            # linking them to the products above
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(sum_nodes[i * n_features + j])
            # now creating the indicator nodes
            input_layer = \
                CategoricalIndicatorLayerLinked(vars=features)
            # linking the sum nodes to the indicator vars
            for i, sum_node in enumerate(sum_nodes):
                # getting the feature id
                j = i % n_features
                # and thus its number of values
                n_values = features[j]
                # getting the indices of indicators
                start_index = sum(features[:j])
                end_index = start_index + n_values
                indicators = [node for node in input_layer.nodes()
                              ][start_index:end_index]
                for ind_node in indicators:
                    sum_node.add_child(ind_node, 1. / n_values)

            # storing levels
            layers = [sum_layer, product_layer, root_layer]
        else:
            # create a categorical smoothed layer
            input_layer = \
                CategoricalSmoothedLayerLinked(vars=features,
                                               node_dicts=node_dict,
                                               alpha=alpha)
            # it shall contain n_leaf_nodes nodes
            smooth_nodes = list(input_layer.nodes())
            assert len(smooth_nodes) == n_leaf_nodes

            # linking it
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(smooth_nodes[i * n_features + j])
            # setting the used levels
            layers = [product_layer, root_layer]

        # create the spn from levels
        kern_spn = SpnLinked(input_layer, layers)
        return kern_spn
예제 #2
0
    def layered_linked_spn(cls, root_node):
        """
        Given a simple linked version (parent->children),
        returns a layered one (linked + layers)
        """
        layers = []
        root_layer = None
        input_nodes = []
        layer_nodes = []
        input_layer = None

        # layers.append(root_layer)
        previous_level = None

        # collecting nodes to visit
        open = deque()
        next_open = deque()
        closed = set()

        open.append(root_node)

        while open:
            # getting a node
            current_node = open.popleft()
            current_id = current_node.id

            # has this already been seen?
            if current_id not in closed:
                closed.add(current_id)
                layer_nodes.append(current_node)
                # print('CURRENT NODE')
                # print(current_node)

                # expand it
                for child in current_node.children:
                    # only for non leaf nodes
                    if (isinstance(child, SumNode)
                            or isinstance(child, ProductNode)):
                        next_open.append(child)
                    else:
                        # it must be an input node
                        if child.id not in closed:
                            input_nodes.append(child)
                            closed.add(child.id)

            # open is now empty, but new open not
            if (not open):
                # swap them
                open = next_open
                next_open = deque()

                # and create a new level alternating type
                if previous_level is None:
                    # it is the first level
                    if isinstance(root_node, SumNode):
                        previous_level = SumLayerLinked([root_node])
                    elif isinstance(root_node, ProductNode):
                        previous_level = ProductLayerLinked([root_node])
                elif isinstance(previous_level, SumLayerLinked):
                    previous_level = ProductLayerLinked(layer_nodes)
                elif isinstance(previous_level, ProductLayerLinked):
                    previous_level = SumLayerLinked(layer_nodes)

                layer_nodes = []

                layers.append(previous_level)

        #
        # finishing layers
        #

        #
        # checking for CLTreeNodes
        cltree_leaves = False
        for node in input_nodes:
            if isinstance(node, CLTreeNode):
                cltree_leaves = True
                break

        if cltree_leaves:
            input_layer = CategoricalCLInputLayerLinked(input_nodes)
        else:
            # otherwiise assuming all input nodes are homogeneous
            if isinstance(input_nodes[0], CategoricalSmoothedNode):
                # print('SMOOTH LAYER')
                input_layer = CategoricalSmoothedLayerLinked(input_nodes)
            elif isinstance(input_nodes[0], CategoricalIndicatorNode):
                input_layer = CategoricalIndicatorLayerLinked(input_nodes)

        spn = SpnLinked(input_layer=input_layer, layers=layers[::-1])
        return spn
예제 #3
0
    def linked_naive_factorization(cls, features, node_dict=None, alpha=0.1):
        """
        WRITEME
        """
        n_features = len(features)

        # create an input layer
        input_layer = None
        layers = None

        # first layer is a product layer with n_feature children
        root_node = ProductNode()
        root_layer = ProductLayerLinked([root_node])

        # second is a sum node on an indicator layer
        if node_dict is None:
            # creating sum nodes
            sum_nodes = [SumNode() for i in range(n_features)]
            # linking to the root
            for node in sum_nodes:
                root_node.add_child(node)
            # store into a level
            sum_layer = SumLayerLinked(sum_nodes)
            # now create an indicator layer
            input_layer = CategoricalIndicatorLayerLinked(vars=features)
            # and linking it
            # TODO make this a function
            for i, sum_node in enumerate(sum_nodes):
                # getting the feature id
                j = i % n_features
                # and thus its number of values
                n_values = features[j]
                # getting the indices of indicators
                start_index = sum(features[:j])
                end_index = start_index + n_values
                indicators = [node for node in input_layer.nodes()
                              ][start_index:end_index]
                for ind_node in indicators:
                    sum_node.add_child(ind_node, 1. / n_values)

            # collecting layers
            layers = [sum_layer, root_layer]

        # or a categorical smoothed layer
        else:
            input_layer = CategoricalSmoothedLayerLinked(vars=features,
                                                         node_dicts=node_dict,
                                                         alpha=alpha)
            # it shall contain n_features nodes
            smooth_nodes = list(input_layer.nodes())
            assert len(smooth_nodes) == n_features
            for node in smooth_nodes:
                root_node.add_child(node)

            # set layers accordingly
            layers = [root_layer]

        # build the spn
        naive_fact_spn = SpnLinked(input_layer, layers)

        return naive_fact_spn
예제 #4
0
    def linked_random_spn_top_down(cls,
                                   vars,
                                   n_layers,
                                   n_max_children,
                                   n_scope_children,
                                   max_scope_split,
                                   merge_prob=0.5,
                                   rand_gen=None):
        """
        WRITEME
        """
        def cluster_scopes(scope_list):
            cluster_dict = {}

            for i, var in enumerate(scope_list):
                cluster_dict[var] += {i}
            return cluster_dict

        def cluster_set_scope(scope_list):
            return {scope for scope in scope_list}

        def link_leaf_to_input_layer(sum_leaf, scope_var, input_layer,
                                     rand_gen):
            for indicator_node in input_layer.nodes():
                if indicator_node.var == scope_var:
                    rand_weight = rand_gen.random()
                    sum_leaf.add_child(indicator_node, rand_weight)
                    # print(sum_leaf, indicator_node, rand_weight)
            # normalizing
            sum_leaf.normalize()

        #
        # creating a product layer
        #

        def build_product_layer(parent_layer, parent_scope_list,
                                n_max_children, n_scope_children, input_layer,
                                rand_gen):

            # grouping the scopes of the parents
            scope_clusters = cluster_set_scope(parent_scope_list)
            # for each scope add a fixed number of children
            children_lists = {
                scope: [
                    ProductNode(var_scope=scope)
                    for i in range(n_scope_children)
                ]
                for scope in scope_clusters
            }
            # counting which node is used
            children_counts = {
                scope: [0 for i in range(n_scope_children)]
                for scope in scope_clusters
            }
            # now link those randomly to their parent
            for parent, scope in zip(parent_layer.nodes(), parent_scope_list):
                # only for nodes not becoming leaves
                if len(scope) > 1:
                    # sampling at most n_max_children from those in the same
                    # scope
                    children_scope_list = children_lists[scope]
                    sample_length = min(len(children_scope_list),
                                        n_max_children)
                    sampled_ids = rand_gen.sample(range(n_scope_children),
                                                  sample_length)
                    sampled_children = [None for i in range(sample_length)]
                    for i, id in enumerate(sampled_ids):
                        # getting the sampled child
                        sampled_children[i] = children_scope_list[id]
                        # updating its counter
                        children_counts[scope][id] += 1

                    for child in sampled_children:
                        # parent is a sum layer, we must set a random weight
                        rand_weight = rand_gen.random()
                        parent.add_child(child, rand_weight)

                    # we can now normalize it
                    parent.normalize()
                else:
                    # binding the node to the input layer
                    (scope_var, ) = scope
                    link_leaf_to_input_layer(parent, scope_var, input_layer,
                                             rand_gen)

            # pruning those children never used
            for scope in children_lists.keys():
                children_scope_list = children_lists[scope]
                scope_counts = children_counts[scope]
                used_children = [
                    child
                    for count, child in zip(scope_counts, children_scope_list)
                    if count > 0
                ]
                children_lists[scope] = used_children

            # creating the layer and new scopelist
            # print('children list val', children_lists.values())
            children_list = [
                child for child in itertools.chain.from_iterable(
                    children_lists.values())
            ]
            scope_list = [
                key for key, child_list in children_lists.items()
                for elem in child_list
            ]
            # print('children list', children_list)
            # print('scope list', scope_list)
            prod_layer = ProductLayerLinked(children_list)

            return prod_layer, scope_list

        def build_sum_layer(parent_layer,
                            parent_scope_list,
                            rand_gen,
                            max_scope_split=-1,
                            merge_prob=0.5):

            # keeping track of leaves
            # leaf_props = []
            scope_clusters = cluster_set_scope(parent_scope_list)

            # looping through all the parent nodes and their scopes
            # in order to decompose their scope
            dec_scope_list = []
            for scope in parent_scope_list:
                # decomposing their scope into k random pieces
                k = len(scope)
                if 1 < max_scope_split <= len(scope):
                    k = rand_gen.randint(2, max_scope_split)
                shuffled_scope = list(scope)
                rand_gen.shuffle(shuffled_scope)
                dec_scopes = [
                    frozenset(shuffled_scope[i::k]) for i in range(k)
                ]
                dec_scope_list.append(dec_scopes)
                # if a decomposed scope consists of only one var, generate a
                # leaf
                # leaves = [(parent, (dec_scope,))
                #           for dec_scope in dec_scopes if len(dec_scope) == 1]
                # leaf_props.extend(leaves)

            # generating a unique decomposition
            used_decs = {}
            children_list = []
            scope_list = []
            for parent, decs in zip(parent_layer.nodes(), dec_scope_list):
                merge_count = 0
                for scope in decs:
                    sum_node = None
                    try:
                        rand_perc = rand_gen.random()
                        if (merge_count < len(decs) - 1
                                and rand_perc > merge_prob):
                            sum_node = used_decs[scope]
                            merge_count += 1

                        else:
                            raise Exception()
                    except:
                        # create a node for it
                        sum_node = SumNode(var_scope=scope)
                        children_list.append(sum_node)
                        scope_list.append(scope)
                        used_decs[scope] = sum_node

                    parent.add_child(sum_node)

            # unique_dec = {frozenset(dec) for dec in
            #               itertools.chain.from_iterable(dec_scope_list)}
            # print('unique dec', unique_dec)
            # building a dict scope->child
            # children_dict = {scope: SumNode() for scope in unique_dec}
            # now linking parents to their children
            # for parent, scope in zip(parent_layer.nodes(),
            #                          parent_scope_list):
            #     dec_scopes = dec_scope_list[scope]
            #     for dec in dec_scopes:
            # retrieving children
            # adding it
            #         parent.add_child(children_dict[dec])

            # we already have the nodes and their scopes
            # children_list = [child for child in children_dict.values()]
            # scope_list = [scope for scope in children_dict.keys()]

            sum_layer = SumLayerLinked(nodes=children_list)

            return sum_layer, scope_list

        # if no generator is provided, create a new one
        if rand_gen is None:
            rand_gen = random.Random()

        # create input layer
        # _vars = [2, 3, 2, 2, 4]
        input_layer = CategoricalIndicatorLayerLinked(vars=vars)

        # create root layer
        full_scope = frozenset({i for i in range(len(vars))})
        root = SumNode(var_scope=full_scope)
        root_layer = SumLayerLinked(nodes=[root])
        last_layer = root_layer

        # create top scope list
        last_scope_list = [full_scope]

        layers = [root_layer]
        layer_count = 0
        stop_building = False
        while not stop_building:
            # checking for early termination
            # this one leads to split product nodes into leaves
            if layer_count >= n_layers:
                print('Max level reached, trying to stop')
                max_scope_split = -1

            # build a new layer alternating types
            if isinstance(last_layer, SumLayerLinked):
                print('Building product layer')
                last_layer, last_scope_list = \
                    build_product_layer(last_layer,
                                        last_scope_list,
                                        n_max_children,
                                        n_scope_children,
                                        input_layer,
                                        rand_gen)
            elif isinstance(last_layer, ProductLayerLinked):
                print('Building sum layer')
                last_layer, last_scope_list = \
                    build_sum_layer(last_layer,
                                    last_scope_list,
                                    rand_gen,
                                    max_scope_split,
                                    merge_prob)

            # testing for more nodes to expand
            if last_layer.n_nodes() == 0:
                print('Stop building')
                stop_building = True
            else:
                layers.append(last_layer)
                layer_count += 1

        # checking for early termination
        # if not stop_building:
        #     if isinstance(last_layer, ProductLayerLinked):
        # building a sum layer splitting everything into one
        # length scopes
        #         last_sum_layer, last_scope_list = \
        #             build_sum_layer(last_layer,
        #                             last_scope_list,
        #                             rand_gen,
        #                             max_scope_split=-1)
        # then linking each node to the input layer
        #         for sum_leaf, scope in zip(last_sum_layer.nodes(),
        #                                    last_scope_list):
        #             (scope_var,) = scope
        #             link_leaf_to_input_layer(sum_leaf,
        #                                      scope_var,
        #                                      input_layer,
        #                                      rand_gen)
        #     elif isinstance(last_layer, SumLayerLinked):
        #         pass

        # print('LAYERS ', len(layers), '\n')
        # for i, layer in enumerate(layers):
        #     print('LAYER ', i)
        #     print(layer)
        # print('\n')
        spn = SpnLinked(input_layer=input_layer, layers=layers[::-1])
        # testing
        # scope_list = [
        #     frozenset({1, 3, 4}), frozenset({2, 0}), frozenset({1, 3, 4})]
        # sum_layer = SumLayerLinked(nodes=[SumNode(), SumNode(), SumNode()])

        # prod_layer, scope_list = build_product_layer(
        #     sum_layer, scope_list, 2, 3, input_layer, rand_gen)

        # sum_layer1, scope_list_2 = build_sum_layer(prod_layer,
        #                                            scope_list,
        #                                            rand_gen,
        #                                            max_scope_split=2
        #                                            )
        # prod_layer_2, scope_list_3 = build_product_layer(sum_layer1,
        #                                                  scope_list_2,
        #                                                  2,
        #                                                  3,
        #                                                  input_layer,
        #                                                  rand_gen)
        # create spn from layers
        # spn = SpnLinked(input_layer=input_layer,
        #                 layers=[prod_layer_2, sum_layer1,
        #                         prod_layer, sum_layer, root_layer])
        return spn
예제 #5
0
def linked_categorical_input_to_indicators(spn, input_layer=None):
    """
    Convertes a linked spn categorical input layer into an indicator one
    """

    #
    # get child, parent relations for node relinking
    child_assoc = retrieve_children_parent_assoc(spn)

    #
    # get input layer
    cat_input_layer = spn.input_layer()
    assert isinstance(cat_input_layer, CategoricalSmoothedLayerLinked)

    #
    # one indicator node for each var value
    vars = cat_input_layer.vars()
    if not vars:
        vars = list(sorted({node.var for node in cat_input_layer.nodes()}))

    feature_values = cat_input_layer.feature_vals()
    # print('vars', vars)
    # print('feature values', feature_values)

    indicator_nodes = [
        CategoricalIndicatorNode(var, val) for i, var in enumerate(vars)
        for val in range(feature_values[i])
    ]
    # for node in indicator_nodes:
    #     print(node)

    indicator_map = defaultdict(set)
    for ind_node in indicator_nodes:
        indicator_map[ind_node.var].add(ind_node)

    sum_nodes = []
    #
    # as many sum nodes as cat nodes
    for node in cat_input_layer.nodes():

        sum_node = SumNode(var_scope=frozenset([node.var]))
        sum_nodes.append(sum_node)

        for ind_node in sorted(indicator_map[node.var],
                               key=lambda x: x.var_val):
            sum_node.add_child(ind_node,
                               numpy.exp(node._var_probs[ind_node.var_val]))

        #
        # removing links to parents
        parents = child_assoc[node]
        for p_node in parents:
            #
            # assume it to be a product node
            # TODO: generalize
            assert isinstance(p_node, ProductNode)
            p_node.children.remove(node)
            p_node.add_child(sum_node)

    #
    # creating layer
    sum_layer = SumLayerLinked(sum_nodes)

    indicator_layer = CategoricalIndicatorLayerLinked(indicator_nodes)

    cat_input_layer.disconnect_layer()
    spn.set_input_layer(indicator_layer)
    spn.insert_layer(sum_layer, 0)

    return spn