Example #1
def create_spflow_spn(n_feats):
    gaussians1 = []
    gaussians2 = []
    for i in range(n_feats):
        g1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        g2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        gaussians1.append(g1)
        gaussians2.append(g2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([gaussians1[i], gaussians1[i + 1]])
        p2 = Product([gaussians2[i], gaussians2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
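
A minimal usage sketch, assuming SPFlow is installed and the names used above (Gaussian, Product, Sum, assign_ids, rebuild_scopes_bottom_up) are already imported from spn.structure.Base and spn.structure.leaves.parametric.Parametric; log_likelihood is taken from spn.algorithms.Inference:

import numpy as np
from spn.algorithms.Inference import log_likelihood

# Build a small SPN over 4 features (n_feats must be even, since leaves are
# paired into Product nodes) and evaluate it on random data.
spn = create_spflow_spn(4)
data = np.random.randn(10, 4)
ll = log_likelihood(spn, data)  # shape (10, 1): one log-likelihood per row
print(ll.mean())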
Example #2
def create_SPN2():
    from spn.structure.Base import assign_ids
    from spn.structure.Base import rebuild_scopes_bottom_up

    from spn.algorithms.Validity import is_valid
    from spn.structure.leaves.parametric.Parametric import Categorical

    from spn.structure.Base import Sum, Product

    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])
    spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    val, msg = is_valid(spn)
    assert val, msg

    return spn
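
The SPN above is defined over three binary variables (scopes 0, 1 and 2), so it can be queried directly. The snippet below is a hedged sketch that assumes SPFlow's convention of marking marginalized variables with np.nan in the input row:

import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_SPN2()

# Joint query log P(X0=1, X1=0, X2=1).
evidence = np.array([[1.0, 0.0, 1.0]])
print(log_likelihood(spn, evidence))

# Marginal query log P(X1=0, X2=1): np.nan marginalizes X0 out.
partial = np.array([[np.nan, 0.0, 1.0]])
print(log_likelihood(spn, partial))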
Example #3
def naive_factorization(data=None,
                        node_id=0,
                        context=None,
                        scope=None,
                        **kwargs):
    assert scope is not None, "No scope"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    result = []
    for i, rv in enumerate(scope):
        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, i].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))

    return prod_node, result
Example #4
def test_log_vector_histogram():
    # Construct a minimal SPN.
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram([0., 1., 2.], [0.45, 0.55], [1, 1], scope=1)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram([0., 1., 2.], [0.875, 0.125], [1, 1], scope=1)

    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    inputs = np.column_stack((
        np.random.randint(2, size=30),
        np.random.randint(2, size=30),
    )).astype("float64")

    if not CPUCompiler.isVectorizationSupported():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CPUCompiler(maxTaskSize=5).log_likelihood(spn, inputs, supportMarginal=False)

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(spn, inputs)
    reference = reference.reshape(30)

    # Check the computation results against the reference
    # Check in normal space if log-results are not very close to each other.
    assert np.all(np.isclose(results, reference)) or np.all(np.isclose(np.exp(results), np.exp(reference)))
Example #5
def test_cpu_histogram():
    # Construct a minimal SPN.
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram([0., 3., 6., 8.], [0.35, 0.1, 0.55], [1, 1], scope=1)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram([0., 5., 8.], [0.875, 0.125], [1, 1], scope=1)

    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    inputs = np.column_stack((
        np.random.randint(2, size=30),
        np.random.randint(8, size=30),
    )).astype("float64")

    # Insert some NaN in random places into the input data.
    inputs.ravel()[np.random.choice(inputs.size, 5, replace=False)] = np.nan

    if not CUDACompiler.isAvailable():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CUDACompiler().log_likelihood(spn, inputs)

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(spn, inputs)
    reference = reference.reshape(30)

    # Check the computation results against the reference
    # Check in normal space if log-results are not very close to each other.
    assert np.all(np.isclose(results, reference)) or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
Example #6
def create_spflow_spn(n_feats, ctype=Gaussian):
    children1 = []
    children2 = []
    for i in range(n_feats):
        if ctype == Gaussian:
            c1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
            c2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        else:
            #c1 = Bernoulli(p=1.0, scope=i)
            #c2 = Bernoulli(p=1.0, scope=i)
            c1 = Bernoulli(p=np.random.rand(), scope=i)
            c2 = Bernoulli(p=np.random.rand(), scope=i)

        children1.append(c1)
        children2.append(c2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([children1[i], children1[i + 1]])
        p2 = Product([children2[i], children2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
Example #7
    def _deserialize_product(self, node, node_map):
        child_ids = node.product.children
        # Resolve references to child nodes by ID.
        children = [node_map.get(id) for id in child_ids]
        # Check that all children have been resolved.
        assert None not in children, "Child node ID could not be resolved"
        product = Product(children=children)
        product.id = node.id
        return product
Example #8
def prod_condition(node, children, input_vals=None, scope=None):
    if not scope.intersection(node.scope):
        return Copy(node), 0
    new_node = Product()
    new_node.scope = list(set(node.scope) - scope)
    probability = 0

    for c in children:
        if c[0]:
            new_node.children.append(c[0])
        probability += float(c[1])
    return new_node, probability
Example #9
def SPN_Reshape(node, max_children=2):
    v, err = is_valid(node)
    assert v, err
    nodes = get_nodes_by_type(node, (Product, Sum))

    while len(nodes) > 0:
        n = nodes.pop()

        if len(n.children) <= max_children:
            continue

        # Node has more than max_children children: build a hierarchy with at
        # most max_children children per node.
        new_children = []
        new_weights = []
        for i in range(0, len(n.children), max_children):
            children = n.children[i:i + max_children]

            if len(children) > 1:
                if isinstance(n, Product):
                    newChild = Product()
                    for c in children:
                        newChild.scope.extend(c.scope)
                    newChild.children.extend(children)
                    new_children.append(newChild)
                else:  # Sum
                    weights = n.weights[i:i + max_children]
                    branch_weight = sum(weights)
                    new_weights.append(branch_weight)

                    newChild = Sum()
                    newChild.scope.extend(children[0].scope)
                    newChild.children.extend(children)
                    newChild.weights.extend(
                        [w / branch_weight for w in weights])
                    newChild.weights[0] = 1.0 - sum(newChild.weights[1:])
                    new_children.append(newChild)
            else:
                new_children.extend(children)

                if isinstance(n, Sum):
                    new_weights.append(1.0 - sum(new_weights))

        n.children = new_children
        if isinstance(n, Sum):
            n.weights = new_weights
        nodes.append(n)

    assign_ids(node)
    v, err = is_valid(node)
    assert v, err
    return node
Example #10
def remove_non_informative_features(data=None,
                                    node_id=0,
                                    scope=None,
                                    context=None,
                                    uninformative_features_idx=None,
                                    **kwargs):
    assert uninformative_features_idx is not None, "parameter uninformative_features_idx can't be None"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    non_zero_variance_rvs = []
    non_zero_variance_idx = []
    result = []
    for idx, zero_var in enumerate(uninformative_features_idx):
        rv = scope[idx]

        if not zero_var:
            non_zero_variance_rvs.append(rv)
            non_zero_variance_idx.append(idx)
            continue

        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, idx].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))
    assert len(result) > 0
    if len(non_zero_variance_idx) > 0:
        prod_node.children.append(None)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": concatenate_yx(data[:, non_zero_variance_idx], x),
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": non_zero_variance_rvs,
            },
        ))

    return prod_node, result
Example #11
def test_cuda_categorical():
    # Construct a minimal SPN
    c1 = Categorical(p=[0.35, 0.55, 0.1], scope=0)
    c2 = Categorical(p=[0.25, 0.625, 0.125], scope=1)
    c3 = Categorical(p=[0.5, 0.2, 0.3], scope=2)
    c4 = Categorical(p=[0.6, 0.15, 0.25], scope=3)
    c5 = Categorical(p=[0.7, 0.11, 0.19], scope=4)
    c6 = Categorical(p=[0.8, 0.14, 0.06], scope=5)
    p = Product(children=[c1, c2, c3, c4, c5, c6])

    # Randomly sample input values.
    inputs = np.column_stack((
        np.random.randint(3, size=30),
        np.random.randint(3, size=30),
        np.random.randint(3, size=30),
        np.random.randint(3, size=30),
        np.random.randint(3, size=30),
        np.random.randint(3, size=30),
    )).astype("float64")

    if not CUDACompiler.isAvailable():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CUDACompiler().log_likelihood(p, inputs, supportMarginal=False)

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(p, inputs)
    reference = reference.reshape(30)

    # Check the computation results against the reference
    # Check in normal space if log-results are not very close to each other.
    assert np.all(np.isclose(results, reference)) or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
Example #12
    def build_recursive(dep_tree, table_keys, scopes, attribute_owners, path_constraints=None, cache=None):
        if path_constraints is None:
            path_constraints = []

        new_node = Sum()
        for (table_names_keys, dep_node) in get_dependncy_keys(dep_tree, table_keys, attribute_owners,
                                                               path_constraints):

            for constraint_configuration, cached_node_count in get_constraint_values(table_names_keys, path_constraints,
                                                                                     cache):
                p_node = Product()
                new_node.children.append(p_node)
                count_value = 1

                for cached_node, node_count in cached_node_count:
                    p_node.children.append(cached_node)
                    count_value *= node_count

                for dep_children_node in dep_node.children:
                    if dep_children_node.name[0] == '@':
                        continue

                    node, count = build_recursive(dep_children_node, table_keys, scopes, attribute_owners,
                                                  path_constraints=constraint_configuration,
                                                  cache=cache)
                    p_node.children.append(node)
                    count_value *= count
                new_node.weights.append(count_value)

        wsum = np.sum(new_node.weights)
        # new_node.weights = [w / wsum for w in new_node.weights]

        return new_node, wsum
Example #13
def create_disj(data, scope, assignments, alpha):

    unq_data, counts = np.unique(data, axis=0, return_counts=True)
    probs = np.zeros(assignments.shape[0])
    for i in range(assignments.shape[0]):
        index = np.where(np.all(assignments[i] == unq_data, axis=1))[0]
        if len(index):
            probs[i] = counts[index[0]]
    probs = (probs + alpha) / (probs + alpha).sum()

    indicators = {
        var: [Bernoulli(scope=[var], p=0),
              Bernoulli(scope=[var], p=1)]
        for var in scope
    }

    prods = []
    for i in range(assignments.shape[0]):
        children = []
        for j in range(assignments.shape[1]):
            children.append(indicators[scope[j]][assignments[i, j]])
            # children.append(Bernoulli(scope=[scope[j]], p=assignments[i, j]))
        prods.append(Product(children=children))

    if len(prods) > 1:
        disj = Sum(children=prods, weights=probs)
    else:
        disj = prods[0]

    assign_ids(disj)
    rebuild_scopes_bottom_up(disj)

    return disj
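
A small usage sketch for create_disj; the toy data and the smoothing constant are illustrative assumptions. assignments enumerates the joint states of the variables in scope, and alpha Laplace-smooths the empirical counts:

import numpy as np

# Two binary variables; the mixture covers all four joint assignments.
data = np.array([[0, 1],
                 [1, 1],
                 [1, 1]])
assignments = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
disj = create_disj(data, scope=[0, 1], assignments=assignments, alpha=0.1)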
Example #14
def test_gaussian_leaf_serialization(tmpdir):
    """Tests the binary serialization of two SPFlow Gaussian leaf nodes
    by round-tripping and comparing the parameters before and after serialization
    & deserialization"""
    g1 = Gaussian(mean=0.5, stdev=1, scope=0)
    g2 = Gaussian(mean=0.125, stdev=0.25, scope=1)
    p = Product(children=[g1, g2])

    binary_file = os.path.join(tmpdir, "test.bin")
    print(f"Test binary file: {binary_file}")

    model = SPNModel(p, "float32", "test")
    query = JointProbability(model)

    BinarySerializer(binary_file).serialize_to_file(query)

    deserialized = BinaryDeserializer(binary_file).deserialize_from_file()

    assert (isinstance(deserialized, JointProbability))
    assert (isinstance(deserialized.graph, SPNModel))
    assert (deserialized.graph.featureType == model.featureType)
    assert (deserialized.graph.name == model.name)

    deserialized = deserialized.graph.root

    assert isinstance(deserialized, Product)
    assert (len(deserialized.children) == 2)
    gaussian1 = deserialized.children[0]
    gaussian2 = deserialized.children[1]
    assert (g1.scope == gaussian1.scope)
    assert (g1.mean == gaussian1.mean)
    assert (g1.stdev == gaussian1.stdev)
    assert (g2.scope == gaussian2.scope)
    assert (g2.mean == gaussian2.mean)
    assert (g2.stdev == gaussian2.stdev)
Example #15
    def tree_to_spn(tree, features):
        tnode = tree.data

        if tnode == "sumnode":
            node = Sum()
            for i in range(int(len(tree.children) / 2)):
                j = 2 * i
                w, c = tree.children[j], tree.children[j + 1]
                node.weights.append(float(w))
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode == "prodnode":
            if len(tree.children) == 1:
                return tree_to_spn(tree.children[0], features)
            node = Product()
            for c in tree.children:
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode in str_to_spn_lambdas:
            return str_to_spn_lambdas[tnode][0](tree, features,
                                                str_to_spn_lambdas[tnode][2],
                                                tree_to_spn)

        raise Exception('Node type not registered: ' + tnode)
Example #16
def test_categorical_leaf_serialization(tmpdir):
    """Tests the binary serialization of two SPFlow Categorical leaf nodes
    by round-tripping and comparing the parameters before and after serialization
    & deserialization"""
    c1 = Categorical(p=[0.35, 0.55, 0.1], scope=1)
    c2 = Categorical(p=[0.25, 0.625, 0.125], scope=2)
    p = Product(children=[c1, c2])

    binary_file = os.path.join(tmpdir, "test.bin")
    print(f"Test binary file: {binary_file}")

    model = SPNModel(p, "uint8", "test")
    query = JointProbability(model)

    BinarySerializer(binary_file).serialize_to_file(query)

    deserialized = BinaryDeserializer(binary_file).deserialize_from_file()

    assert (isinstance(deserialized, JointProbability))
    assert (isinstance(deserialized.graph, SPNModel))
    assert (deserialized.graph.featureType == model.featureType)
    assert (deserialized.graph.name == model.name)

    deserialized = deserialized.graph.root

    assert isinstance(deserialized, Product)
    assert (len(deserialized.children) == 2)
    assert len(c1.p) == len(deserialized.children[0].p)
    for i, p in enumerate(c1.p):
        assert p == deserialized.children[0].p[i]
    assert len(c2.p) == len(deserialized.children[1].p)
    for i, p in enumerate(c2.p):
        assert p == deserialized.children[1].p[i]
Example #17
    def test_torch_vs_tf_time(self):
        # Create sample data
        from sklearn.datasets import make_blobs  # samples_generator was removed in newer scikit-learn
        import tensorflow as tf
        from time import time

        X, y = make_blobs(n_samples=10,
                          centers=3,
                          n_features=2,
                          random_state=0)
        X = X.astype(np.float32)

        # SPFLow implementation
        g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
        g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
        g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
        g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
        p0 = Product(children=[g00, g10])
        p1 = Product(children=[g01, g11])
        s = Sum(weights=[0.2, 0.8], children=[p0, p1])
        assign_ids(s)
        rebuild_scopes_bottom_up(s)

        # Convert
        tf_spn, data_placeholder, variable_dict = spn_to_tf_graph(s, data=X)
        torch_spn = SumNode.from_spn(s)

        # Optimizer
        lr = 0.001
        tf_optim = tf.train.AdamOptimizer(lr)
        torch_optim = optim.Adam(torch_spn.parameters(), lr)

        t0 = time()
        epochs = 10
        optimize_tf_graph(tf_spn,
                          variable_dict,
                          data_placeholder,
                          X,
                          epochs=epochs,
                          optimizer=tf_optim)
        t1 = time()
        optimize_torch(torch_spn, X, epochs=epochs, optimizer=torch_optim)
        t2 = time()

        print("Tensorflow took: ", t1 - t0)
        print("PyTorch took: ", t2 - t1)
Example #18
def create_product(data=None,
                   node_id=0,
                   parent_id=0,
                   pos=0,
                   context=None,
                   scope=None,
                   split_cols=None,
                   **kwargs):
    assert split_cols is not None, "No split_cols lambda"
    assert scope is not None, "No scope"
    data_slices = split_cols(data, context, scope)

    result = []

    if len(data_slices) == 1:
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": data,
                "parent_id": parent_id,
                "pos": pos,
                "no_independencies": True,
                "scope": scope,
            },
        ))
        return None, result

    node = Product()
    node.scope.extend(scope)
    node.id = node_id

    for data_slice, scope_slice, _ in data_slices:
        assert isinstance(scope_slice, list), "slice must be a list"

        node.children.append(None)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": data_slice,
                "parent_id": node_id,
                "pos": len(node.children) - 1,
                "scope": scope_slice,
            },
        ))

    return node, result
Example #19
    def test_sum(self):
        spn = Product()
        for s in range(7):
            spn.children.append(Leaf(scope=s))

        new_spn = SPN_Reshape(spn, 2)

        print(spn)
Example #20
    def test_sum(self):
        spn = Product()
        for s in range(7):
            spn.children.append(Leaf(scope=s))

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)

        new_spn = SPN_Reshape(spn, 2)

        print(spn)
Example #21
def create_conj(data, scope, alpha):

    conj = Product(children=[
        Bernoulli(scope=[scope[k]],
                  p=(data[0][k] * data.shape[0] + alpha) /
                  (data.shape[0] + 2 * alpha)) for k in range(len(scope))
    ])

    assign_ids(conj)
    rebuild_scopes_bottom_up(conj)

    return conj
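
For illustration only (the data below is an assumed toy dataset): create_conj builds a conjunction that tracks the first row of data, with alpha smoothing each Bernoulli parameter towards 0.5:

import numpy as np

data = np.array([[1, 0, 1],
                 [1, 0, 1],
                 [0, 1, 1]])
conj = create_conj(data, scope=[0, 1, 2], alpha=0.5)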
Example #22
    def __init__(self):
        p0 = Product(children=[
            Categorical(p=[0.3, 0.7], scope=1),
            Categorical(p=[0.4, 0.6], scope=2)
        ])
        p1 = Product(children=[
            Categorical(p=[0.5, 0.5], scope=1),
            Categorical(p=[0.6, 0.4], scope=2)
        ])
        s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
        p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
        p3 = Product(children=[
            Categorical(p=[0.2, 0.8], scope=0),
            Categorical(p=[0.3, 0.7], scope=1)
        ])
        p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])

        self.spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

        assign_ids(self.spn)
        rebuild_scopes_bottom_up(self.spn)
Example #23
def test_binary_serialization_roundtrip(tmpdir):
    """Tests the binary serialization for SPFlow SPNs by round-tripping 
    a simple SPN through serialization and de-serialization and comparing
    the graph-structure before and after serialization & de-serialization."""
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=1)
    h2 = Histogram([0., 1., 2.], [0.45, 0.55], [1, 1], scope=2)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=1)
    h4 = Histogram([0., 1., 2.], [0.875, 0.125], [1, 1], scope=2)

    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    model = SPNModel(spn, featureValueType="uint32")
    query = JointProbability(model)

    binary_file = os.path.join(tmpdir, "test.bin")
    print(f"Test binary file: {binary_file}")

    BinarySerializer(binary_file).serialize_to_file(query)

    deserialized = BinaryDeserializer(binary_file).deserialize_from_file()

    assert (isinstance(deserialized, JointProbability))
    assert (deserialized.batchSize == query.batchSize)
    assert (deserialized.errorModel.error == query.errorModel.error)
    assert (deserialized.errorModel.kind == query.errorModel.kind)
    assert (deserialized.graph.featureType == model.featureType)
    assert (deserialized.graph.name == model.name)

    deserialized = deserialized.graph.root
    assert get_number_of_nodes(spn) == get_number_of_nodes(deserialized)
    assert get_number_of_nodes(spn,
                               Sum) == get_number_of_nodes(deserialized, Sum)
    assert get_number_of_nodes(spn, Product) == get_number_of_nodes(
        deserialized, Product)
    assert get_number_of_nodes(spn, Histogram) == get_number_of_nodes(
        deserialized, Histogram)
    assert get_number_of_edges(spn) == get_number_of_edges(deserialized)
Example #24
def remove_non_informative_features(data=None,
                                    node_id=0,
                                    scope=None,
                                    **kwargs):
    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    uninformative_features_idx = np.var(data[:, scope], 0) == 0
    zero_variance_rvs = [s for s in scope]
    result = []
    for idx, zero_var in enumerate(uninformative_features_idx):
        if not zero_var:
            continue
        prod_node.children.append(None)
        rv = scope[idx]
        data_slice = data[:, rv].reshape(-1, 1)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": node_id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))
        # Remove by value: deleting by position would shift later indices
        # after earlier removals and drop the wrong RVs.
        zero_variance_rvs.remove(rv)
    assert len(result) > 0
    prod_node.children.append(None)
    result.append((
        SplittingOperations.GET_NEXT_OP,
        {
            "data": data[:, zero_variance_rvs],
            "parent_id": node_id,
            "pos": len(prod_node.children) - 1,
            "scope": zero_variance_rvs,
        },
    ))
    return prod_node, result
Example #25
def naive_factorization(data=None, node_id=0, scope=None, **kwargs):
    assert scope is not None, "No scope"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    result = []
    for rv in scope:
        prod_node.children.append(None)
        data_slice = data[:, rv].reshape(-1, 1)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": node_id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))

    return prod_node, result
Example #26
    def process_data(table_name, lower, higher, table, table_meta_data, scopes, keys_left, non_key_features, siblings,
                     cache):
        # dig into the constraints
        curr_att = keys_left[0]
        att_pos = table_meta_data[curr_att]

        constraint_table = cache.get(curr_att, None)
        if constraint_table is None:
            cache[curr_att] = constraint_table = {}
        else:
            assert False

        table = table[lower:higher]
        column = table[:, att_pos]
        vals, counts = np.unique(column, return_counts=True)
        for val, count in zip(vals, counts):
            l = np.searchsorted(column, val, side='left')
            h = np.searchsorted(column, val, side='right')

            node = None
            if attribute_owners[curr_att] == table_name and False:
                # if I'm the owner, we add the filter
                node = CategoricalDictionary(p={val: count}, scope=scopes[curr_att])
                node.att = curr_att
                node.debug = lambda self: "C_%s(%s,%s)" % (self.id, self.params, self.att)

            if len(keys_left) > 1:
                val_constraint_table = constraint_table.get(val, None)
                if val_constraint_table is None:
                    constraint_table[val] = val_constraint_table = {}
                else:
                    assert False

                new_siblings = list(siblings)
                if node is not None:
                    new_siblings += [node]
                process_data(table_name, l, h, table, table_meta_data, scopes, keys_left[1:], non_key_features,
                             new_siblings, val_constraint_table)
            else:
                p_node = Product()
                # p_node.debug = lambda self: "P_%s(%s)" % (self.id, ",".join([str(p) for p in self.children]))

                if val not in constraint_table:
                    constraint_table[val] = (p_node, count)
                else:
                    assert False

                p_node.children.extend(siblings)
                if node is not None:
                    p_node.children.append(node)
                p_node.children.extend(factorize_data(l, h, table, non_key_features))
Example #27
def get_credit_spn():
    from spn.structure.Base import Product
    from spn.structure.leaves.parametric.Parametric import Categorical

    spn1 = Categorical(p=[0.0, 1.0], scope=[2]) * Categorical(p=[0.5, 0.5],
                                                              scope=[3])
    spn2 = Categorical(p=[1.0, 0.0], scope=[2]) * Categorical(p=[0.1, 0.9],
                                                              scope=[3])
    spn3 = 0.3 * spn1 + 0.7 * spn2
    spn4 = Categorical(p=[0.0, 1.0], scope=[1]) * spn3

    spn6 = Product([
        Categorical(p=[1.0, 0.0], scope=[1]),
        Categorical(p=[0.0, 1.0], scope=[2]),
        Categorical(p=[1.0, 0.0], scope=[3])
    ])
    spn6.scope = [1, 2, 3]

    spn7 = 0.8 * spn4 + 0.2 * spn6
    spn = spn7 * Categorical(p=[0.2, 0.8], scope=[0])

    spn.scope = sorted(spn.scope)
    return spn
Example #28
    def test_equal_to_tf(self):
        # SPFLow implementation
        g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
        g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
        g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
        g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
        p0 = Product(children=[g00, g10])
        p1 = Product(children=[g01, g11])
        s = Sum(weights=[0.2, 0.8], children=[p0, p1])

        assign_ids(s)
        rebuild_scopes_bottom_up(s)

        # Test for 100 random samples
        data = np.random.randn(100, 2)

        # LL from SPN
        ll = log_likelihood(s, data)

        # PyTorch implementation
        g00 = GaussianNode(mean=0.0, std=1.0, scope=0)
        g10 = GaussianNode(mean=1.0, std=2.0, scope=1)
        g01 = GaussianNode(mean=3.0, std=2.0, scope=0)
        g11 = GaussianNode(mean=5.0, std=1.0, scope=1)
        p0 = ProductNode(children=[g00, g10])
        p1 = ProductNode(children=[g01, g11])
        rootnode = SumNode(weights=[0.2, 0.8], children=[p0, p1])

        datatensor = torch.Tensor(data)
        # LL from pytorch
        ll_torch = rootnode(datatensor)

        # Assert equality
        self.assertTrue(
            np.isclose(np.array(ll).squeeze(),
                       ll_torch.detach().numpy(),
                       atol=DELTA).all())
Example #29
    def create_flat_spn_recursive(node, distribution_mix, prob=1.0, independent_nodes=None):
        # Avoid a shared mutable default argument across top-level calls.
        if independent_nodes is None:
            independent_nodes = []

        if isinstance(node, Sum):
            for i, c in enumerate(node.children):
                forwarded_weight = node.weights[i] * prob
                create_flat_spn_recursive(c, distribution_mix, forwarded_weight, independent_nodes.copy())

        elif isinstance(node, Product):

            stop = False
            next_node = None

            for c in node.children:
                if target_id in c.scope:
                    if len(c.scope) == 1:
                        stop = True
                        independent_nodes.append(deepcopy(c))
                    else:
                        next_node = c
                else:
                    for feature_id in c.scope:
                        weighted_nodes = get_nodes_with_weight(c, feature_id)
                        t_node = type(weighted_nodes[0][1])
                        mixed_node = distribution_mix[t_node](weighted_nodes)
                        independent_nodes.append(mixed_node)

            if stop:
                flat_spn.weights.append(prob)
                prod = Product(children=independent_nodes)
                prod.scope = spn.scope
                flat_spn.children.append(prod)

            else:
                create_flat_spn_recursive(next_node, distribution_mix, prob, independent_nodes)

        else:
            raise Exception("Can only iterate over Sum and Product nodes")
Example #30
def create_naive_fact(data, scope, alpha):
    """
    Returns a naive factorization of the data.
    Laplace's correction (alpha) is not strictly required, but omitting it can
    yield zero probabilities and numerical underflow.
    """

    probs = (np.sum(data, axis=0) + alpha) / (data.shape[0] + 2 * alpha)

    naive_fact = Product(children=[
        Bernoulli(p=probs[k], scope=[scope[k]]) for k in range(len(scope))
    ])

    assign_ids(naive_fact)
    rebuild_scopes_bottom_up(naive_fact)

    return naive_fact
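
A hedged usage sketch with assumed toy data: the naive factorization models each variable in scope as an independent Bernoulli, so the whole model is a single Product over smoothed leaf estimates:

import numpy as np
from spn.algorithms.Inference import log_likelihood

data = np.array([[1, 0, 1],
                 [0, 0, 1],
                 [1, 1, 1]])
nf = create_naive_fact(data, scope=[0, 1, 2], alpha=1.0)
print(log_likelihood(nf, data.astype(np.float64)))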