def get_structure_stats(node):
    num_nodes = len(get_nodes_by_type(node, Node))
    sum_nodes = get_nodes_by_type(node, Sum)
    n_sum_nodes = len(sum_nodes)
    n_prod_nodes = len(get_nodes_by_type(node, Product))
    leaf_nodes = get_nodes_by_type(node, Leaf)
    n_leaf_nodes = len(leaf_nodes)
    edges = get_number_of_edges(node)
    layers = get_depth(node)

    # every sum-node weight and every leaf parameter counts as one parameter
    params = 0
    for n in sum_nodes:
        params += len(n.children)
    for leaf in leaf_nodes:
        params += len(leaf.parameters)

    return """---Structure Statistics---
# nodes             %s
# sum nodes         %s
# prod nodes        %s
# leaf nodes        %s
# params            %s
# edges             %s
# layers            %s""" % (
        num_nodes,
        n_sum_nodes,
        n_prod_nodes,
        n_leaf_nodes,
        params,
        edges,
        layers,
    )
def get_deepdb_size(spn_ensemble):
    # only deal with a single table, so there is only one spn
    spn = spn_ensemble.spns[0].mspn
    size = 0

    nodes = get_nodes_by_type(spn, Product)
    for node in nodes:
        size += len(node.children) + len(node.scope)

    nodes = get_nodes_by_type(spn, Sum)
    for node in nodes:
        assert len(node.children) == len(node.weights) == len(node.cluster_centers)
        assert len(node.cluster_centers[0]) == len(node.scope)
        num_child = len(node.children)
        num_var = len(node.scope)
        # children, weights, scope, cluster_centers
        size += 2 * num_child + num_var + num_var * num_child

    nodes = get_nodes_by_type(spn, Categorical)
    for node in nodes:
        assert len(node.scope) == 1
        # scope, cardinality, p
        size += 2 + len(node.p)

    nodes = get_nodes_by_type(spn, IdentityNumericLeaf)
    for node in nodes:
        assert len(node.scope) == 1
        assert len(node.unique_vals) + 1 == len(node.prob_sum)
        # scope, cardinality, null_value_prob, unique_vals, prob_sum
        size += 3 + len(node.unique_vals) + len(node.prob_sum)

    # assume 4 bytes to store each integer and float
    return size * 4 / 1024 / 1024  # MB
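# Hedged worked example, not part of the original source: a Categorical leaf
# over 10 categories contributes 2 + 10 = 12 stored values, i.e. 48 bytes at
# the assumed 4 bytes per value; a model holding one million stored values in
# total comes to 4e6 / 1024 / 1024 ≈ 3.81 MB.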
def get_execution_layers(spn):
    all_nodes = set(get_nodes_by_type(spn, ntype=(Sum, Product)))
    next_filter_type = Product

    leaves = get_nodes_by_type(spn, Leaf)
    layers = [np.asarray([n.id for n in leaves])]
    layer_types = [Leaf]
    seen_nodes = set(leaves)

    while len(all_nodes) > 0:
        filtered_nodes = []
        new_all_nodes = set()
        filter_type = next_filter_type
        for n in all_nodes:
            if isinstance(n, filter_type) and set(n.children).issubset(seen_nodes):
                filtered_nodes.append(n)
            else:
                new_all_nodes.add(n)

        if filter_type == Product:
            next_filter_type = Sum
        else:
            next_filter_type = Product

        if len(filtered_nodes) == 0:
            continue

        assert all_nodes == new_all_nodes | set(filtered_nodes)

        layer_types.append(filter_type)
        all_nodes = new_all_nodes
        layers.append(np.asarray([n.id for n in filtered_nodes]))
        seen_nodes.update(filtered_nodes)

    return layers, layer_types
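def _example_execution_layers():
    # Hedged usage sketch, not part of the original source: on a toy SPN the
    # layers come back bottom-up, starting at the leaves and then alternating
    # Product/Sum as soon as all children of a node have been scheduled. The
    # SPFlow import paths and the Gaussian(mean, stdev, scope) signature are
    # assumptions.
    from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
    from spn.structure.leaves.parametric.Parametric import Gaussian

    spn = 0.5 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(0.0, 1.0, scope=1)) + 0.5 * (
        Gaussian(1.0, 1.0, scope=0) * Gaussian(1.0, 1.0, scope=1)
    )
    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    layers, layer_types = get_execution_layers(spn)
    print(layer_types)  # expected: [Leaf, Product, Sum]
    print(layers)       # node ids per layer, bottom-up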
def get_structure_stats_dict(node):
    node_types = dict(Counter([type(n) for n in get_nodes_by_type(node)]))

    num_nodes = len(get_nodes_by_type(node, Node))
    edges = get_number_of_edges(node)
    layers = get_number_of_layers(node)

    # dict.update() returns None, so merging inline would make this function
    # return None; build the dict first, then merge the per-type counts into it
    result = {"nodes": num_nodes, "edges": edges, "layers": layers}
    result.update(node_types)
    return result
def run_experiment(exp, spn, test_data, test_type, exp_lambda):
    outprefix = path + "/spns/%s/" % (exp)

    results_file = "%stime_test_%s_ll_%s.txt" % (outprefix, test_type, OS_name)
    if os.path.isfile(results_file):
        return

    print(exp, test_data.shape, test_type)

    ll, test_time = exp_lambda()
    np.savetxt(results_file, ll, delimiter=";")

    import cpuinfo

    machine = cpuinfo.get_cpu_info()["brand"]

    adds, muls = fpga_count_ops(spn)

    test_n = test_data.shape[0]

    results = OrderedDict()
    results["Experiment"] = exp
    results["OS"] = OS_name
    results["machine"] = machine
    results["test type"] = test_type
    results["expected adds"] = adds
    results["expected muls"] = muls
    results["input rows"] = test_n
    results["input cols"] = test_data.shape[1]
    results["spn nodes"] = len(get_nodes_by_type(spn, Node))
    results["spn sum nodes"] = len(get_nodes_by_type(spn, Sum))
    results["spn prod nodes"] = len(get_nodes_by_type(spn, Product))
    results["spn leaves"] = len(get_nodes_by_type(spn, Leaf))
    results["spn edges"] = get_number_of_edges(spn)
    results["spn layers"] = get_number_of_layers(spn)
    results["time per task"] = test_time
    results["time per instance"] = test_time / test_n
    results["avg ll"] = np.mean(ll, dtype=np.float128)

    results_file_name = "results.csv"
    if not os.path.isfile(results_file_name):
        results_file = open(results_file_name, "w")
        results_file.write(";".join(results.keys()))
        results_file.write("\n")
    else:
        results_file = open(results_file_name, "a")

    results_file.write(";".join(map(str, results.values())))
    results_file.write("\n")
    results_file.close()
def get_structure_stats_dict(node):
    node_types = dict(Counter([type(n) for n in get_nodes_by_type(node)]))

    num_nodes = len(get_nodes_by_type(node, Node))
    edges = get_number_of_edges(node)
    layers = get_depth(node)

    result = {"nodes": num_nodes, "edges": edges, "layers": layers, "count_per_type": node_types}
    return result
def get_structure_stats(node):
    num_nodes = len(get_nodes_by_type(node, Node))
    sum_nodes = len(get_nodes_by_type(node, Sum))
    prod_nodes = len(get_nodes_by_type(node, Product))
    leaf_nodes = len(get_nodes_by_type(node, Leaf))
    edges = get_number_of_edges(node)
    layers = get_number_of_layers(node)

    return """---Structure Statistics---
# nodes             %s
# sum nodes         %s
# prod nodes        %s
# leaf nodes        %s
# edges             %s
# layers            %s""" % (num_nodes, sum_nodes, prod_nodes, leaf_nodes, edges, layers)
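def _example_structure_stats():
    # Hedged usage sketch, not part of the original source: build a tiny SPN
    # with the arithmetic shorthand also used by test_ll_matrix below and
    # print its statistics. The SPFlow import paths and the
    # Gaussian(mean, stdev, scope) signature are assumptions.
    from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
    from spn.structure.leaves.parametric.Parametric import Gaussian

    spn = 0.4 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(1.0, 1.0, scope=1)) + 0.6 * (
        Gaussian(2.0, 1.0, scope=0) * Gaussian(3.0, 1.0, scope=1)
    )
    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    # expected: 7 nodes (1 sum, 2 products, 4 leaves)
    print(get_structure_stats(spn))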
def evaluate_spn_statistics(spn_path, target_csv_path, build_time_path):
    csv_list = []

    # SPN learn times
    for filename in os.listdir(spn_path):
        logger.debug(f'Reading {filename}')
        if not filename.startswith("ensemble") or filename.endswith('.zip'):
            continue

        spn_ensemble = read_ensemble(os.path.join(spn_path, filename))
        for spn in spn_ensemble.spns:
            num_nodes = len(get_nodes_by_type(spn.mspn, Node))
            upper_bound = 200 * len(spn.column_names) - 1
            # assert num_nodes <= upper_bound, "Num of nodes upper bound is wrong"
            csv_list.append((filename, spn.learn_time, spn.full_sample_size, spn.min_instances_slice,
                             spn.rdc_threshold, len(spn.relationship_set), len(spn.table_set),
                             " - ".join([table for table in spn.table_set]), len(spn.column_names),
                             num_nodes, upper_bound))

    # HDF create times
    with open(build_time_path) as f:
        hdf_preprocessing_time = int(f.readlines()[0])
        csv_list += [('generate_hdf', hdf_preprocessing_time, 0, 0, 0, 0, 0, "")]

    with open(target_csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'learn_time', 'full_sample_size', 'min_instances_slice', 'rdc_threshold',
                         'no_joins', 'no_tables', 'tables', 'no_columns', 'structure_stats', 'upper_bound'])
        writer.writerows(csv_list)
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, **kwargs):
    for _ in range(iterations):
        lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]

        for node_type, func in node_updates.items():
            # TODO: do in parallel
            for node in get_nodes_by_type(spn, node_type):
                func(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    root_lls=R,
                    all_lls=lls_per_node,
                    all_gradients=gradients,
                    data=data,
                    **kwargs
                )
def is_structured_decomposable(spn, verbose=False):
    if not is_consistent(spn):
        return False

    nodes = get_nodes_by_type(spn)

    scope_set = set()
    for n in nodes:
        if isinstance(n, Product):
            scope_set.add(tuple(n.scope))
        elif isinstance(n, CLTree):
            vtree = from_dtree_to_vtree(n.dtree_root)
            scope_set.update([tuple(s) for s in vtree.scopes])

    scopes = list(scope_set)
    scopes = [set(t) for t in scopes]

    # ordering is not needed, but useful for printing
    if verbose:
        scopes.sort(key=len)
        for s in scopes:
            print(s)

    for i in range(len(scopes)):
        for j in range(len(scopes)):
            int_len = len(scopes[i].intersection(scopes[j]))
            if int_len != 0 and int_len != min(len(scopes[i]), len(scopes[j])):
                return False

    return True
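def _example_scope_compatibility():
    # Hedged illustration, not part of the original source, of the pairwise
    # test above: two scopes are compatible when they are disjoint or one
    # contains the other; any partial overlap breaks structured
    # decomposability.
    def compatible(scopes):
        for a in scopes:
            for b in scopes:
                k = len(a & b)
                if k != 0 and k != min(len(a), len(b)):
                    return False
        return True

    assert compatible([{0}, {1}, {0, 1}, {0, 1, 2}])  # nested hierarchy
    assert not compatible([{0, 1}, {1, 2}])           # partial overlap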
def meu(root, input_data, node_bottom_up_meu=_node_bottom_up_meu, in_place=False):
    # valid, err = is_valid(node)
    # assert valid, err
    if in_place:
        data = input_data
    else:
        data = np.copy(input_data)

    nodes = get_nodes_by_type(root)

    utility_scope = set()
    for node in nodes:
        if type(node) is Utility:
            utility_scope.add(node.scope[0])

    assert np.all(np.isnan(data[:, list(utility_scope)])), "Please specify all utility values as np.nan"

    likelihood_per_node = np.zeros((data.shape[0], len(nodes)))
    meu_per_node = np.zeros((data.shape[0], len(nodes)))
    meu_per_node.fill(np.nan)

    # one pass bottom up evaluating the likelihoods
    likelihood(root, data, dtype=data.dtype, lls_matrix=likelihood_per_node)
    # pass the node_bottom_up_meu argument through instead of hard-coding the default
    eval_spmn_bottom_up_meu(root, node_bottom_up_meu, meu_per_node=meu_per_node, data=data,
                            lls_per_node=likelihood_per_node)
    result = meu_per_node[:, root.id]
    return result
def best_next_decision(root, input_data, in_place=False):
    if in_place:
        data = input_data
    else:
        data = np.copy(input_data)

    nodes = get_nodes_by_type(root)
    dec_dict = {}

    # find all possible decision values
    for node in nodes:
        if type(node) == Max:
            if node.dec_idx in dec_dict:
                # set.union() returns a new set; update() mutates in place
                dec_dict[node.dec_idx].update(set(node.dec_values))
            else:
                dec_dict[node.dec_idx] = set(node.dec_values)

    next_dec_idx = None
    # find next undefined decision
    for idx in dec_dict.keys():
        if np.all(np.isnan(data[:, idx])):
            next_dec_idx = idx
            break

    assert next_dec_idx is not None, "please assign all values of next decision to np.nan"

    # determine best decisions based on meu
    dec_vals = list(dec_dict[next_dec_idx])
    best_decisions = np.full((1, data.shape[0]), dec_vals[0])
    data[:, next_dec_idx] = best_decisions
    meu_best = meu(root, data)
    for i in range(1, len(dec_vals)):
        decisions_i = np.full((1, data.shape[0]), dec_vals[i])
        data[:, next_dec_idx] = decisions_i
        meu_i = meu(root, data)
        best_decisions = np.select([np.greater(meu_i, meu_best), True], [decisions_i, best_decisions])
        meu_best = np.maximum(meu_i, meu_best)

    return best_decisions
def generate_adhoc_value_dict(spn):
    val_dict = {}

    for leaf in get_nodes_by_type(spn, Leaf):
        assert len(leaf.scope) == 1
        feature_id = leaf.scope[0]

        if feature_id in val_dict:
            if val_dict[feature_id][0] == "numeric":
                v_min, v_max = _get_min_max_numeric_from_leaf(leaf)
                if v_min < val_dict[feature_id][2][0]:
                    val_dict[feature_id][2][0] = v_min
                if v_max > val_dict[feature_id][2][1]:
                    val_dict[feature_id][2][1] = v_max
        else:
            if isinstance(leaf, Categorical):
                val_dict[feature_id] = ["discrete", "Attr_" + str(feature_id),
                                        {i: str(i) for i in range(len(leaf.p))}]
            elif isinstance(leaf, Gaussian) or isinstance(leaf, PiecewiseLinear) or isinstance(leaf, IdentityNumeric):
                val_dict[feature_id] = ["numeric", "Attr_" + str(feature_id),
                                        _get_min_max_numeric_from_leaf(leaf)]
            else:
                raise Exception("Cannot process node-type: " + str(leaf))

    return val_dict
def meu(node, input_data, node_top_down_meu=_node_top_down_meu, node_bottom_up_meu=_node_bottom_up_meu,
        in_place=False):
    valid, err = is_valid(node)
    assert valid, err

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    # log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)
    likelihood(node, data, dtype=data.dtype, node_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)
    meu_val = lls_per_node[:, 0]

    instance_ids = np.arange(data.shape[0])

    # one pass top down to decide on the max branch until it reaches a leaf;
    # returns all_result and the decisions at each max node for each instance
    all_result, all_decisions = eval_spn_top_down_meu(node, node_top_down_meu, parent_result=instance_ids,
                                                      data=data, lls_per_node=lls_per_node)
    decisions = merge_rows_for_decisions(all_decisions)

    return meu_val, decisions
def is_consistent(node):
    """
    the children of each product node have pairwise disjoint scopes
    """
    assert node is not None

    allchildscope = set()
    for prod_node in reversed(get_nodes_by_type(node, Product)):
        nscope = set(prod_node.scope)

        if len(prod_node.children) == 0:
            return False, "Product node %s has no children" % (prod_node.id)

        allchildscope.clear()
        sum_features = 0
        for child in prod_node.children:
            sum_features += len(child.scope)
            allchildscope.update(child.scope)

        if allchildscope != nscope or sum_features != len(allchildscope):
            return False, "children of (prod) node %s do not have exclusive scope" % (prod_node.id)

    return True, None
def _get_networkx_obj(spn):
    import networkx as nx
    from spn.structure.Base import Sum, Product, Leaf, get_nodes_by_type
    import numpy as np

    all_nodes = get_nodes_by_type(spn)
    logger.info(all_nodes)

    g = nx.Graph()

    labels = {}
    for n in all_nodes:
        if isinstance(n, Sum):
            label = "+"
        elif isinstance(n, Product):
            label = "x"
        else:
            label = "V" + str(n.scope[0])

        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    return g, labels
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, skip_validation=False, **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]

        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    root_lls=R,
                    all_lls=lls_per_node,
                    all_gradients=gradients,
                    data=data,
                    **kwargs
                )
def is_valid(node, check_ids=True):
    if check_ids:
        val, err = has_valid_ids(node)
        if not val:
            return val, err

    for n in get_nodes_by_type(node):
        if len(n.scope) == 0:
            return False, "node %s has no scope" % (n.id)

        is_sum = isinstance(n, Sum)
        is_prod = isinstance(n, Product)

        if is_sum:
            if len(n.children) != len(n.weights):
                return False, "node %s has different children/weights" % (n.id)

        if is_sum or is_prod:
            if len(n.children) == 0:
                return False, "node %s has no children" % (n.id)

    a, err = is_consistent(node)
    if not a:
        return a, err

    b, err = is_complete(node)
    if not b:
        return b, err

    return True, None
def feature_gradient(node, data, node_gradient_functions=_node_feature_gradients, lls_per_node=None):
    """
    Feature gradients are computed for the input query and each feature using
    backward automatic differentiation. In mathematical terms, it computes the
    partial derivatives \partial P(X) / \partial X_i

    :param node: Node for the gradient calculation
    :param data: data for the computation. NaN values are implicitly marginalized out
    :param lls_per_node: optional matrix for storing the intermediate results
    """
    all_leaves = get_nodes_by_type(node, Leaf)

    # explicit None check: truth-testing a numpy array raises a ValueError
    if lls_per_node is None:
        lls_per_node = np.full((data.shape[0], get_number_of_nodes(node)), np.nan)
    log_likelihood(node, data, lls_matrix=lls_per_node)

    gradients = np.exp(gradient_backward(node, lls_per_node))

    node_gradients = []
    for spn_node in all_leaves:
        i = spn_node.id
        result = node_gradient_functions[type(spn_node)](spn_node, data)
        node_gradients.append(result * gradients[:, i].reshape(-1, 1))

    node_gradients = np.array(node_gradients)

    return np.nansum(node_gradients, axis=0)
def sample_instances(node, input_data, rand_gen, node_sampling=_node_sampling, in_place=False):
    """
    Implementing hierarchical sampling
    """

    # first, we do a bottom-up pass to compute the likelihood taking into account marginals.
    # then we do a top-down pass, to sample taking into account the likelihoods.
    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(data), axis=1)
    ), "each row must have at least one NaN value where the samples will be substituted"

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    log_likelihood(node, data, dtype=data.dtype, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    eval_spn_top_down(node, node_sampling, input_vals=instance_ids, data=data, lls_per_node=lls_per_node,
                      rand_gen=rand_gen)

    return data
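def _example_sample_instances():
    # Hedged usage sketch, not part of the original source: every row carries
    # np.nan in the slots to be sampled; observed values stay fixed and
    # condition the top-down pass. Assumes SPFlow's import paths and that
    # sampling handlers are registered for the leaf type used.
    from numpy.random import RandomState
    from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
    from spn.structure.leaves.parametric.Parametric import Gaussian

    spn = 0.5 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(0.0, 1.0, scope=1)) + 0.5 * (
        Gaussian(5.0, 1.0, scope=0) * Gaussian(5.0, 1.0, scope=1)
    )
    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    # row 0: sample both features; row 1: sample X1 conditioned on X0 = 0.1
    data = np.array([[np.nan, np.nan], [0.1, np.nan]])
    print(sample_instances(spn, data, RandomState(17)))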
def mpe(
    node,
    input_data,
    node_top_down_mpe=_node_top_down_mpe,
    node_bottom_up_mpe_log=_node_bottom_up_mpe_log,
    in_place=False,
):
    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(input_data), axis=1)
    ), "each row must have at least one NaN value where the samples will be substituted"

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_mpe_log, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    # one pass top down to decide on the max branch until it reaches a leaf, then it fills the NaN slot with the mode
    eval_spn_top_down(node, node_top_down_mpe, parent_result=instance_ids, data=data, lls_per_node=lls_per_node)

    return data
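def _example_mpe():
    # Hedged usage sketch, not part of the original source: as with sampling,
    # the query slots are marked with np.nan and mpe() fills them with the
    # most probable values given the observed columns. Assumes SPFlow's import
    # paths and that MPE handlers are registered for the leaf type used.
    from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
    from spn.structure.leaves.parametric.Parametric import Gaussian

    spn = 0.5 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(0.0, 1.0, scope=1)) + 0.5 * (
        Gaussian(5.0, 1.0, scope=0) * Gaussian(5.0, 1.0, scope=1)
    )
    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    query = np.array([[5.2, np.nan]])  # observe X0, complete X1
    print(mpe(spn, query))             # X1 should come out near 5.0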
def __get_networkx_obj(spn):
    import networkx as nx

    all_nodes = get_nodes_by_type(spn)

    g = nx.Graph()

    labels = {}
    for n in all_nodes:
        if isinstance(n, Sum):
            label = "+\n{}".format(n.scope)
        elif isinstance(n, Product):
            label = "x"
        elif isinstance(n, Gaussian):
            label = "G" + str(n.scope[0]) + "\n(" + str(round(n.mean, 2)) + ", " + str(round(n.stdev, 2)) + ")"
        elif isinstance(n, Categorical):
            vals = [round(x, 2) for x in n.p]
            label = "C" + str(n.scope[0]) + " (" + str(vals) + ")"
        else:
            label = "Unk" + str(n.scope[0])

        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    return g, labels
def Expectation(spn, feature_id, ranges, node_expectation, node_likelihood):
    def leaf_expectation(node, data, dtype=np.float64, **kwargs):
        if node.scope[0] == feature_id:
            t_node = type(node)
            if t_node in node_expectation:
                exps = np.zeros((data.shape[0], 1), dtype=dtype)
                exps[:] = node_expectation[t_node](node)
                return exps
            else:
                raise Exception("Node type unknown for expectation: " + str(t_node))
        else:
            t_node = type(node)
            if t_node in node_likelihood:
                return node_likelihood[t_node](node, ranges, node_likelihood=node_likelihood)

    node_expectations = {type(leaf): leaf_expectation for leaf in get_nodes_by_type(spn, Leaf)}
    node_expectations.update({Sum: sum_likelihood, Product: prod_likelihood})

    expectation = likelihood(spn, ranges, node_likelihood=node_expectations)
    expectation = expectation / likelihood(spn, ranges, node_likelihood=node_likelihood)

    return expectation
def build_spn(dep_tree, table_keys, scopes, attribute_owners, path_constraints=None, cache=None):
    def build_recursive(dep_tree, table_keys, scopes, attribute_owners, path_constraints=None, cache=None):
        if path_constraints is None:
            path_constraints = []

        new_node = Sum()

        for (table_names_keys, dep_node) in get_dependncy_keys(dep_tree, table_keys, attribute_owners,
                                                               path_constraints):
            for constraint_configuration, cached_node_count in get_constraint_values(table_names_keys,
                                                                                     path_constraints, cache):
                p_node = Product()
                new_node.children.append(p_node)
                count_value = 1

                for cached_node, node_count in cached_node_count:
                    p_node.children.append(cached_node)
                    count_value *= node_count

                for dep_children_node in dep_node.children:
                    if dep_children_node.name[0] == '@':
                        continue
                    node, count = build_recursive(dep_children_node, table_keys, scopes, attribute_owners,
                                                  path_constraints=constraint_configuration, cache=cache)
                    p_node.children.append(node)
                    count_value *= count

                new_node.weights.append(count_value)

        wsum = np.sum(new_node.weights)
        # new_node.weights = [w / wsum for w in new_node.weights]

        return new_node, wsum

    root, count = build_recursive(dep_tree, table_keys, scopes, attribute_owners,
                                  path_constraints=path_constraints, cache=cache)

    # normalize the sum weights and the categorical distributions of the finished structure
    for sum_node in get_nodes_by_type(root, Sum):
        normalization = np.sum(sum_node.weights)
        sum_node.weights = [w / normalization for w in sum_node.weights]

    for cat_node in get_nodes_by_type(root, CategoricalDictionary):
        psum = 0
        for name, count in cat_node.p.items():
            psum += count
        cat_node.p = {name: count / psum for name, count in cat_node.p.items()}

    return root
def test_ll_matrix(self):
    add_node_likelihood(Leaf, sum_and_multiplier_ll)

    node_1_1_1_1 = leaf(2, 1)
    node_1_1_1_2 = leaf(2, 2)
    node_1_1_1 = 0.7 * node_1_1_1_1 + 0.3 * node_1_1_1_2
    node_1_1_2 = leaf([0, 1], 3)
    node_1_1 = node_1_1_1 * node_1_1_2
    node_1_2_1_1_1 = leaf(0, 5)
    node_1_2_1_1_2 = leaf(1, 4)
    node_1_2_1_1 = node_1_2_1_1_1 * node_1_2_1_1_2
    node_1_2_1_2 = leaf([0, 1], 6)
    node_1_2_1 = 0.1 * node_1_2_1_1 + 0.9 * node_1_2_1_2
    node_1_2_2 = leaf(2, 3)
    node_1_2 = node_1_2_1 * node_1_2_2
    spn = 0.4 * node_1_1 + 0.6 * node_1_2

    assign_ids(spn)

    max_id = max([n.id for n in get_nodes_by_type(spn)])

    data = np.random.rand(10, 10)

    node_1_1_1_1_r = data[:, 2] * 1
    node_1_1_1_2_r = data[:, 2] * 2
    node_1_1_1_r = 0.7 * node_1_1_1_1_r + 0.3 * node_1_1_1_2_r
    node_1_1_2_r = 3 * (data[:, 0] + data[:, 1])
    node_1_1_r = node_1_1_1_r * node_1_1_2_r
    node_1_2_1_1_1_r = data[:, 0] * 5
    node_1_2_1_1_2_r = data[:, 1] * 4
    node_1_2_1_1_r = node_1_2_1_1_1_r * node_1_2_1_1_2_r
    node_1_2_1_2_r = 6 * (data[:, 0] + data[:, 1])
    node_1_2_1_r = 0.1 * node_1_2_1_1_r + 0.9 * node_1_2_1_2_r
    node_1_2_2_r = data[:, 2] * 3
    node_1_2_r = node_1_2_1_r * node_1_2_2_r
    spn_r = 0.4 * node_1_1_r + 0.6 * node_1_2_r

    self.assert_correct(spn, data, spn_r)

    lls = np.zeros((data.shape[0], max_id + 1))
    likelihood(spn, data, lls_matrix=lls)
    llls = np.zeros((data.shape[0], max_id + 1))
    log_likelihood(spn, data, lls_matrix=llls)

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling
    self.assertTrue(np.all(np.isclose(lls, np.exp(llls))))

    self.assertTrue(np.all(np.isclose(spn_r, lls[:, spn.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_r, lls[:, node_1_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_2_r, lls[:, node_1_2_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_1_r, lls[:, node_1_2_1.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_1_2_r, lls[:, node_1_2_1_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_1_1_r, lls[:, node_1_2_1_1.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_1_1_2_r, lls[:, node_1_2_1_1_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_2_1_1_1_r, lls[:, node_1_2_1_1_1.id])))
    self.assertTrue(np.all(np.isclose(node_1_1_r, lls[:, node_1_1.id])))
    self.assertTrue(np.all(np.isclose(node_1_1_2_r, lls[:, node_1_1_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_1_1_r, lls[:, node_1_1_1.id])))
    self.assertTrue(np.all(np.isclose(node_1_1_1_2_r, lls[:, node_1_1_1_2.id])))
    self.assertTrue(np.all(np.isclose(node_1_1_1_1_r, lls[:, node_1_1_1_1.id])))
def Expectation(spn, feature_scope, evidence_scope, evidence, node_expectation=_node_expectation):
    """Compute the Expectation:

        E[X_feature_scope | X_evidence_scope]

    given the spn and the evidence data

    Keyword arguments:
    spn -- the spn to compute the probabilities from
    feature_scope -- set() of integers, the scope of the features to get the expectation from
    evidence_scope -- set() of integers, the scope of the evidence features
    evidence -- numpy 2d array of the evidence data
    """

    if evidence_scope is None:
        evidence_scope = set()

    assert not (len(evidence_scope) > 0 and evidence is None)

    assert len(feature_scope.intersection(evidence_scope)) == 0

    marg_spn = marginalize(spn, keep=feature_scope | evidence_scope)

    def leaf_expectation(node, data, dtype=np.float64, **kwargs):
        if node.scope[0] in feature_scope:
            t_node = type(node)
            if t_node in node_expectation:
                exps = np.zeros((data.shape[0], 1), dtype=dtype)
                exps[:] = node_expectation[t_node](node)
                return exps
            else:
                raise Exception('Node type unknown: ' + str(t_node))

        return likelihood(node, evidence)

    node_expectations = {type(leaf): leaf_expectation for leaf in get_nodes_by_type(marg_spn, Leaf)}
    node_expectations.update({Sum: sum_likelihood, Product: prod_likelihood})

    if evidence is None:
        # fake_evidence only provides the shape; its values are never read
        fake_evidence = np.zeros((1, len(spn.scope))).reshape(1, -1)
        expectation = likelihood(marg_spn, fake_evidence, node_likelihood=node_expectations)
        return expectation

    # if we have evidence, we want to compute the conditional expectation
    expectation = likelihood(marg_spn, evidence, node_likelihood=node_expectations)
    expectation = expectation / likelihood(marginalize(marg_spn, keep=evidence_scope), evidence)

    return expectation
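def _example_expectation():
    # Hedged usage sketch, not part of the original source: conditional
    # expectation E[X0 | X1 = 1.0] on a toy SPN. Assumes SPFlow's import paths
    # and that node_expectation maps Gaussian leaves to their means.
    from spn.structure.Base import assign_ids, rebuild_scopes_bottom_up
    from spn.structure.leaves.parametric.Parametric import Gaussian

    spn = 0.5 * (Gaussian(0.0, 1.0, scope=0) * Gaussian(0.0, 1.0, scope=1)) + 0.5 * (
        Gaussian(5.0, 1.0, scope=0) * Gaussian(5.0, 1.0, scope=1)
    )
    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    evidence = np.array([[np.nan, 1.0]])
    print(Expectation(spn, feature_scope={0}, evidence_scope={1}, evidence=evidence))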
def plot_spn2(spn, fname="plot.pdf"):
    import networkx as nx
    from networkx.drawing.nx_pydot import graphviz_layout
    import matplotlib.pyplot as plt
    from spn.structure.Base import Sum, Product, Leaf, get_nodes_by_type
    import numpy as np

    all_nodes = get_nodes_by_type(spn)

    g = nx.DiGraph()

    labels = {}
    edge_labels = {}
    for n in all_nodes:
        if isinstance(n, Sum):
            label = "+"
        elif isinstance(n, Product):
            label = "x"
        else:
            label = "V" + str(n.scope[0])

        g.add_node(n.id)
        labels[n.id] = label

        if isinstance(n, Leaf):
            continue
        for i, c in enumerate(n.children):
            edge_label = ""
            if isinstance(n, Sum):
                edge_label = np.round(n.weights[i], 2)
            g.add_edge(c.id, n.id, weight=edge_label)

    pos = graphviz_layout(g, prog='dot', args="height=200")
    # pos = nx.drawing.layout.rescale_layout(pos, 10)
    plt.figure(figsize=(18, 12))
    ax = plt.gca()
    ax.invert_yaxis()

    nx.draw(g, pos, with_labels=True, arrows=False, node_color='#DDDDDD', edge_color='#DDDDDD', width=1,
            node_size=250, labels=labels, font_size=6)
    ax.collections[0].set_edgecolor("#888888")
    edge_labels = nx.draw_networkx_edge_labels(g, pos=pos, edge_labels=nx.get_edge_attributes(g, 'weight'),
                                               font_size=5, clip_on=False, alpha=0.6)

    plt.tight_layout()
    plt.savefig(fname)
def validate_ids(node):
    all_nodes = get_nodes_by_type(node)

    ids = set()
    for n in all_nodes:
        ids.add(n.id)

    assert len(ids) == len(all_nodes), "not all nodes have unique IDs"
    assert min(ids) == 0 and max(ids) == len(ids) - 1, "IDs are not in order"
def python_eval_func(data):
    num_nodes = len(get_nodes_by_type(node))
    # It has to be this way - otherwise the data doesn't appear contiguous in CPP.
    # np.ascontiguousarray doesn't seem to work either.
    results = []
    for _ in range(num_nodes):
        results += np.zeros(shape=(data.shape[0]), dtype="float32").tolist()
    results = np.array(results).reshape((data.shape[0], num_nodes))
    spn_many(data, results, results.shape[0])
    return results
def init_spn_sampling(node):
    all_nodes = get_nodes_by_type(node)
    map_id_nodes = {}
    for n in all_nodes:
        map_id_nodes[n.id] = n
    reset_node_counters(node)
    return map_id_nodes