Example #1
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, skip_validation=False, **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)  # top-down pass computing log-space gradients w.r.t. every node

        R = lls_per_node[:, 0]  # log-likelihood of the root node (id 0) for every sample

        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    root_lls=R,
                    all_lls=lls_per_node,
                    all_gradients=gradients,
                    data=data,
                    **kwargs
                )
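For context, a minimal usage sketch; the toy structure and data below are illustrative, assuming the usual SPFlow imports (Sum, Product, Gaussian, assign_ids, rebuild_scopes_bottom_up).

import numpy as np
from spn.structure.Base import Sum, Product, assign_ids, rebuild_scopes_bottom_up
from spn.structure.leaves.parametric.Parametric import Gaussian

# tiny two-feature mixture: a Sum over two Products of Gaussian leaves
spn = Sum(weights=[0.5, 0.5], children=[
    Product(children=[Gaussian(0.0, 1.0, scope=0), Gaussian(0.0, 1.0, scope=1)]),
    Product(children=[Gaussian(3.0, 1.0, scope=0), Gaussian(3.0, 1.0, scope=1)]),
])
assign_ids(spn)
rebuild_scopes_bottom_up(spn)

data = np.random.randn(100, 2)
EM_optimization(spn, data, iterations=10)  # updates the SPN parameters in place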
Example #2
File: EM.py Project: willis-hu/SPFlow
def EM_optimization(spn,
                    data,
                    iterations=5,
                    node_updates=_node_updates,
                    **kwargs):
    for _ in range(iterations):
        lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))  # unlike Example #1, a fresh buffer each iteration

        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]

        for node_type, func in node_updates.items():  # TODO: do in parallel
            for node in get_nodes_by_type(spn, node_type):
                func(node,
                     node_lls=lls_per_node[:, node.id],
                     node_gradients=gradients[:, node.id],
                     root_lls=R,
                     all_lls=lls_per_node,
                     all_gradients=gradients,
                     data=data,
                     **kwargs)
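The node_updates mapping is the extension point here: it maps a node type to an update function that receives the per-node log-likelihoods and log-gradients shown in the call above. A hedged sketch of a custom hook follows; MyLeaf and the parameter update are hypothetical, and spn/data are assumed from a setup like the sketch after Example #1. Only the keyword signature mirrors the loop.

def my_leaf_em_update(node, node_lls=None, node_gradients=None, root_lls=None,
                      all_lls=None, all_gradients=None, data=None, **kwargs):
    # per-sample responsibility of this node, computed in log space:
    # gradient(node) * P(node) / P(root)
    responsibility = np.exp(node_gradients + node_lls - root_lls)
    # ... reweight the node's parameters with `responsibility` here ...

custom_updates = dict(_node_updates)
custom_updates[MyLeaf] = my_leaf_em_update  # MyLeaf is a placeholder node type
EM_optimization(spn, data, node_updates=custom_updates)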
Example #3
def feature_gradient(node,
                     data,
                     node_gradient_functions=_node_feature_gradients,
                     lls_per_node=None):
    """
    Feature gradients are computed for the input query and each feature using
    the backwards automatic differentiation. In mathematicl terms, it computes the
    partial derivatives \partial P(X) / \partial X_i
 

    :param node: Node for the gradient calculation
    :param data: data for the computation. NaN values are implicitely marginalized out
    :param lls_per_node: optional for storing the intermediate results
    """

    all_leaves = get_nodes_by_type(node, Leaf)

    if lls_per_node is None:  # truth-testing a NumPy array would raise a ValueError
        lls_per_node = np.full((data.shape[0], get_number_of_nodes(node)),
                               np.nan)
    log_likelihood(node, data, lls_matrix=lls_per_node)

    gradients = np.exp(gradient_backward(node, lls_per_node))

    node_gradients = []

    for spn_node in all_leaves:
        i = spn_node.id
        result = node_gradient_functions[type(spn_node)](spn_node, data)
        node_gradients.append(result * gradients[:, i].reshape(-1, 1))

    node_gradients = np.array(node_gradients)

    return np.nansum(node_gradients, axis=0)
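A hedged usage sketch with illustrative query values, reusing a toy SPN like the one sketched after Example #1: NaN entries are marginalized out during the likelihood pass, and their NaN gradient contributions are dropped by the final np.nansum.

query = np.array([[0.5, 1.0],
                  [np.nan, 2.0]])  # NaN marginalizes feature 0 in the second row
grads = feature_gradient(spn, query)
# grads[i, j] holds the partial derivative of P(x_i) w.r.t. feature j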
Example #4
def generate_code(spn_id, spn, meta_types, floating_data_type):
    """
    Generates inference code for an SPN
    :param target_path: the path the generated C++ code is written to
    :param floating_data_type: data type floating numbers are represented in generated C++ code
    :param spn: root node of an SPN
    :return: code string
    """

    # make sure we have ids
    assign_ids(spn)

    # fill method body according to SPN structure
    method_body = generate_method_body(spn, spn, floating_data_type, 0)

    # build parameters used in generated c++ function
    method_params = []
    passed_params = []
    for i, meta_type in enumerate(meta_types):
        if meta_type == MetaType.DISCRETE:
            method_params += [
                f'vector <int> possibleValues{i}', f'int nullValueIdx{i}'
            ]
            passed_params += [
                f'py::arg("possibleValues{i}")', f'py::arg("nullValueIdx{i}")'
            ]
        elif meta_type == MetaType.REAL:
            method_params += [
                f'bool inverse{i}', f'bool leftMinusInf{i}',
                f'float leftCondition{i}', f'bool rightMinusInf{i}',
                f'float rightCondition{i}', f'bool leftIncluded{i}',
                f'bool rightIncluded{i}', f'float nullValue{i}'
            ]
            passed_params += [
                f'py::arg("inverse{i}")', f'py::arg("leftMinusInf{i}")',
                f'py::arg("leftCondition{i}")', f'py::arg("rightMinusInf{i}")',
                f'py::arg("rightCondition{i}")', f'py::arg("leftIncluded{i}")',
                f'py::arg("rightIncluded{i}")', f'py::arg("nullValue{i}")'
            ]

    value_dictionary = {
        'spn_id': spn_id,
        'method_body': method_body,
        'method_params': ', '.join(method_params),
        'node_count': get_number_of_nodes(spn),
        'passed_params': ', '.join(passed_params),
        'floating_data_type': floating_data_type
    }
    generated_method = replace_template(TemplatePath.METHOD_MASTER,
                                        value_dictionary, 0)
    registrate_method = replace_template(TemplatePath.REGISTRATION_MASTER,
                                         value_dictionary, 0)

    return generated_method, registrate_method
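A minimal usage sketch, assuming the project's C++ templates are available on disk; in SPFlow, MetaType can be imported from spn.structure.StatisticalTypes.

from spn.structure.StatisticalTypes import MetaType

meta_types = [MetaType.DISCRETE, MetaType.REAL]  # one entry per feature
method_code, registration_code = generate_code("spn_0", spn, meta_types, "double")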
Example #5
def get_node_description(spn, parent_node, size):
    # parent_node.validate()
    parent_type = type(parent_node).__name__
    node_descriptions = dict()
    node_descriptions['num'] = len(parent_node.children)
    nodes = list()
    for i, node in enumerate(parent_node.children):
        node_spn = Copy(node)
        assign_ids(node_spn)
        node_dir = dict()
        node_dir['weight'] = parent_node.weights[i] if parent_type == 'Sum' else 1
        node_dir['size'] = get_number_of_nodes(node) - 1
        node_dir['num_children'] = len(node.children) if not isinstance(node, Leaf) else 0
        node_dir['leaf'] = isinstance(node, Leaf)
        node_dir['type'] = type(node).__name__ + ' Node'
        if not isinstance(node, Leaf):
            node_dir['split_features'] = [list(c.scope) for c in node.children]
            # only the per-child scope lists can be sorted by length
            node_dir['split_features'].sort(key=len)
        else:
            node_dir['split_features'] = node.scope
        node_dir['depth'] = get_depth(node)
        node_dir['child_depths'] = [] if isinstance(node, Leaf) else [get_depth(c) for c in node.children]

        descriptor = node_dir['type']
        if all((d == 0 for d in node_dir['child_depths'])):
            descriptor = 'shallow ' + descriptor
            node_dir['quick'] = 'shallow'
        elif len([d for d in node_dir['child_depths'] if d == 0]) == 1:
            node_dir['quick'] = 'split_one'
            descriptor += ', which separates one feature'
        else:
            node_dir['quick'] = 'deep'
            descriptor = 'deep ' + descriptor
        descriptor = 'a ' + descriptor
        node_dir['descriptor'] = descriptor
        node_dir['short_descriptor'] = descriptor
        node_dir['representative'] = mpe(node_spn, np.array([[np.nan] * size]))
        nodes.append(node_dir)
    node_descriptions['shallow'] = len([d for d in nodes if d['quick'] == 'shallow'])
    node_descriptions['split_one'] = len([d for d in nodes if d['quick'] == 'split_one'])
    node_descriptions['deep'] = len([d for d in nodes if d['quick'] == 'deep'])
    nodes.sort(key=lambda x: x['weight'], reverse=True)
    node_descriptions['nodes'] = nodes
    return node_descriptions
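A hedged usage sketch: calling it on the root with size set to the number of features, so the all-NaN row lets mpe fill in a representative value for every feature.

desc = get_node_description(spn, spn, size=2)
print(desc['num'], 'children:', desc['shallow'], 'shallow,', desc['deep'], 'deep')
for child in desc['nodes']:  # sorted by descending weight
    print(child['descriptor'], child['representative'])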
Example #6
def test_binary_serialization_roundtrip(tmpdir):
    """Tests the binary serialization for SPFlow SPNs by round-tripping
    a simple SPN through serialization and de-serialization and comparing
    the graph structure before and after serialization & de-serialization."""
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=1)
    h2 = Histogram([0., 1., 2.], [0.45, 0.55], [1, 1], scope=2)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=1)
    h4 = Histogram([0., 1., 2.], [0.875, 0.125], [1, 1], scope=2)

    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    model = SPNModel(spn, featureValueType="uint32")
    query = JointProbability(model)

    binary_file = os.path.join(tmpdir, "test.bin")
    print(f"Test binary file: {binary_file}")

    BinarySerializer(binary_file).serialize_to_file(query)

    deserialized = BinaryDeserializer(binary_file).deserialize_from_file()

    assert isinstance(deserialized, JointProbability)
    assert deserialized.batchSize == query.batchSize
    assert deserialized.errorModel.error == query.errorModel.error
    assert deserialized.errorModel.kind == query.errorModel.kind
    assert deserialized.graph.featureType == model.featureType
    assert deserialized.graph.name == model.name

    deserialized = deserialized.graph.root
    assert get_number_of_nodes(spn) == get_number_of_nodes(deserialized)
    assert get_number_of_nodes(spn, Sum) == get_number_of_nodes(deserialized, Sum)
    assert get_number_of_nodes(spn, Product) == get_number_of_nodes(deserialized, Product)
    assert get_number_of_nodes(spn, Histogram) == get_number_of_nodes(deserialized, Histogram)
    assert get_number_of_edges(spn) == get_number_of_edges(deserialized)
Example #7
def EM_optimization(spn,
                    data,
                    iterations=5,
                    node_updates=_node_updates,
                    skip_validation=False,
                    **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    # node_updates = {Sum_sharedWeights: sum_em_update_shared}
    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, lls_matrix=lls_per_node)  # dtype=data.dtype

        gradients = gradient_backward(spn, lls_per_node)

        # collect node.weights for every shared-weight sum node (None for all
        # other node types), indexed in get_nodes_by_type order
        weights = [
            node.weights if isinstance(node, Sum_sharedWeights) else None
            for node in get_nodes_by_type(spn)
        ]

        R = lls_per_node[:, 0]
        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(node,
                     node_lls=lls_per_node[:, node.id],
                     node_gradients=gradients[:, node.id],
                     root_lls=R,
                     all_lls=lls_per_node,
                     all_gradients=gradients,
                     data=data,
                     spn=spn,
                     weights=weights,
                     **kwargs)
Example #8
def _serialize_model(self, model):
    msg = spflow_capnp.Model.new_message()
    assert is_valid(model.root), "SPN invalid before serialization"
    # Assign (new) IDs to the nodes.
    # Keep track of already assigned IDs, so the IDs are
    # unique for the whole file.
    assign_ids(model.root, self.assignedIDs)
    # Rebuild scopes bottom-up
    rebuild_scopes_bottom_up(model.root)
    msg.rootNode = model.root.id
    msg.numFeatures = len(model.root.scope)
    msg.featureType = model.featureType
    scope = msg.init("scope", len(model.root.scope))
    for i, v in enumerate(model.root.scope):
        scope[i] = self._unwrap_value(v)
    name = ""
    if model.name is not None:
        name = model.name
    msg.name = name
    numNodes = get_number_of_nodes(model.root)
    nodes = msg.init("nodes", numNodes)
    nodeList = ListHandler(nodes)
    self._serialize_graph([model.root], nodeList)
    return msg
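A hedged sketch of writing the resulting builder to disk; the serializer instance is hypothetical, while write() is the standard pycapnp builder API.

# assuming a serializer instance exposing _serialize_model as above
msg = serializer._serialize_model(model)
with open("model.bin", "wb") as f:
    msg.write(f)  # pycapnp: write the message in the standard binary format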