def create_disj(data, scope, assignments, alpha):
    unq_data, counts = np.unique(data, axis=0, return_counts=True)
    probs = np.zeros(assignments.shape[0])
    for i in range(assignments.shape[0]):
        index = np.where(np.all(assignments[i] == unq_data, axis=1))[0]
        if len(index):
            probs[i] = counts[index[0]]
    probs = (probs + alpha) / (probs + alpha).sum()

    indicators = {
        var: [Bernoulli(scope=[var], p=0), Bernoulli(scope=[var], p=1)]
        for var in scope
    }

    prods = []
    for i in range(assignments.shape[0]):
        children = []
        for j in range(assignments.shape[1]):
            children.append(indicators[scope[j]][assignments[i, j]])
            # children.append(Bernoulli(scope=[scope[j]], p=assignments[i, j]))
        prods.append(Product(children=children))

    if len(prods) > 1:
        disj = Sum(children=prods, weights=probs)
    else:
        disj = prods[0]

    assign_ids(disj)
    rebuild_scopes_bottom_up(disj)
    return disj
def create_SPN2():
    from spn.structure.Base import assign_ids
    from spn.structure.Base import rebuild_scopes_bottom_up
    from spn.algorithms.Validity import is_valid
    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.Base import Sum, Product

    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])
    spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    val, msg = is_valid(spn)
    assert val, msg
    return spn
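# Usage sketch (not part of the original snippet): evaluate the toy SPN built by
# create_SPN2 on a few complete assignments of its three variables. Assumes
# SPFlow's standard inference entry point spn.algorithms.Inference.log_likelihood.
import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_SPN2()
test_data = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1]], dtype=np.float64)
ll = log_likelihood(spn, test_data)  # one log-density per row
print(ll)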
def test_spn_to_str_and_back(self):
    self.check_obj_and_reconstruction(
        Categorical(p=[0.1, 0.2, 0.7], scope=0))

    self.check_obj_and_reconstruction(Gaussian(mean=0, stdev=10, scope=0))
    self.check_obj_and_reconstruction(
        Gaussian(mean=1.2, stdev=1.5, scope=0))
    self.check_obj_and_reconstruction(Gaussian(mean=-1.2, stdev=1, scope=0))

    gamma = Gamma(alpha=1, beta=2, scope=0)
    lnorm = LogNormal(mean=1, stdev=2, scope=0)
    self.check_obj_and_reconstruction(gamma)
    self.check_obj_and_reconstruction(lnorm)

    root = Sum(children=[gamma, lnorm], weights=[0.2, 0.8])
    assign_ids(root)
    rebuild_scopes_bottom_up(root)
    self.check_obj_and_reconstruction(root)

    root = 0.3 * (Gaussian(mean=0, stdev=1, scope=0) * Gaussian(
        mean=1, stdev=1, scope=1)) + 0.7 * (Gaussian(
            mean=2, stdev=1, scope=0) * Gaussian(mean=3, stdev=1, scope=1))
    self.check_obj_and_reconstruction(root)
def marginalize(node, scope):
    assert isinstance(scope, set), "scope must be a set"

    def marg_recursive(node):
        node_scope = set(node.scope)
        if node_scope.issubset(scope):
            return None

        if isinstance(node, Leaf):
            if len(node.scope) > 1:
                raise Exception('Leaf Node with |scope| > 1')
            return node

        newNode = node.__class__()

        # a sum node gets copied with all its children, or gets removed completely
        if isinstance(node, Sum):
            newNode.weights.extend(node.weights)

        for c in node.children:
            newChild = marg_recursive(c)
            if newChild is None:
                continue
            newNode.children.append(newChild)
        return newNode

    newNode = marg_recursive(node)
    rebuild_scopes_bottom_up(newNode)
    newNode = prune(newNode)
    valid, err = is_valid(newNode)
    assert valid, err
    assign_ids(newNode)
    return newNode
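# Usage sketch (assumption, not from the original code): in this variant the
# `scope` argument appears to be the set of variables to sum out, since any
# subtree whose scope is contained in it is dropped. The toy SPN from
# create_SPN2 above is reused purely for illustration.
spn = create_SPN2()           # scope {0, 1, 2}
marg = marginalize(spn, {2})  # remove variable 2
print(marg.scope)             # expected to cover only {0, 1}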
def create_spflow_spn(n_feats):
    gaussians1 = []
    gaussians2 = []
    for i in range(n_feats):
        g1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        g2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        gaussians1.append(g1)
        gaussians2.append(g2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([gaussians1[i], gaussians1[i + 1]])
        p2 = Product([gaussians2[i], gaussians2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
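# Usage sketch (not part of the original snippet): n_feats must be even here,
# since features are consumed in pairs (i, i + 1). Assumes SPFlow's
# log_likelihood for evaluation.
import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_spflow_spn(4)                     # 4 features -> 2 pairwise sums
x = np.random.randn(8, 4).astype(np.float64)   # 8 random instances
print(log_likelihood(spn, x))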
def create_spflow_spn(n_feats, ctype=Gaussian):
    children1 = []
    children2 = []
    for i in range(n_feats):
        if ctype == Gaussian:
            c1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
            c2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        else:
            # c1 = Bernoulli(p=1.0, scope=i)
            # c2 = Bernoulli(p=1.0, scope=i)
            c1 = Bernoulli(p=np.random.rand(), scope=i)
            c2 = Bernoulli(p=np.random.rand(), scope=i)
        children1.append(c1)
        children2.append(c2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([children1[i], children1[i + 1]])
        p2 = Product([children2[i], children2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
def test_sum(self):
    spn = Product()
    for s in range(7):
        spn.children.append(Leaf(scope=s))

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    new_spn = SPN_Reshape(spn, 2)
    print(spn)
def create_conj(data, scope, alpha):
    conj = Product(children=[
        Bernoulli(scope=[scope[k]],
                  p=(data[0][k] * data.shape[0] + alpha) /
                    (data.shape[0] + 2 * alpha))
        for k in range(len(scope))
    ])
    assign_ids(conj)
    rebuild_scopes_bottom_up(conj)
    return conj
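# Worked example (hypothetical data, not from the original code): create_conj
# builds one Bernoulli leaf per variable, smoothed toward the values of the
# first row of the cluster.
import numpy as np

cluster = np.array([[1, 0], [1, 0], [1, 0]])
conj = create_conj(cluster, scope=[0, 1], alpha=0.1)
# leaf for var 0: p = (1 * 3 + 0.1) / (3 + 0.2) ≈ 0.969
# leaf for var 1: p = (0 * 3 + 0.1) / (3 + 0.2) ≈ 0.031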
def _deserialize_model(self, model):
    rootID = model.rootNode
    featureType = model.featureType
    name = model.name
    if name == "":
        name = None

    rootNodes = self._binary_deserialize_graph(model.nodes)
    for root in rootNodes:
        rebuild_scopes_bottom_up(root)
        valid, err = is_valid(root)
        assert valid, f"SPN invalid after deserialization: {err}"

    rootNode = next((root for root in rootNodes if root.id == rootID), None)
    if rootNode is None:
        logger.error(f"Did not find serialized root node {rootID}")
    return SPNModel(rootNode, featureType, name)
def create_naive_fact(data, scope, alpha):
    """
    Returns a naive factorization of the data (a product of independent
    Bernoulli leaves). Laplace smoothing (alpha) is not strictly required,
    but without it leaf probabilities can become exactly 0 or 1 and cause
    underflow / -inf log-likelihoods.
    """
    probs = (np.sum(data, axis=0) + alpha) / (data.shape[0] + 2 * alpha)
    naive_fact = Product(children=[
        Bernoulli(p=probs[k], scope=[scope[k]]) for k in range(len(scope))
    ])
    assign_ids(naive_fact)
    rebuild_scopes_bottom_up(naive_fact)
    return naive_fact
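# Worked example (hypothetical data, not from the original code): the leaf
# probabilities are the smoothed column means of the binary data.
import numpy as np

binary_data = np.array([[1, 0], [1, 1], [0, 0]])
nf = create_naive_fact(binary_data, scope=[0, 1], alpha=1)
# probs = (column sums + alpha) / (N + 2 * alpha) = ([2, 1] + 1) / (3 + 2) = [0.6, 0.4]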
def test_torch_vs_tf_time(self):
    # Create sample data
    from sklearn.datasets import make_blobs
    import tensorflow as tf
    from time import time

    X, y = make_blobs(n_samples=10, centers=3, n_features=2, random_state=0)
    X = X.astype(np.float32)

    # SPFlow implementation
    g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
    g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
    g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
    g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
    p0 = Product(children=[g00, g10])
    p1 = Product(children=[g01, g11])
    s = Sum(weights=[0.2, 0.8], children=[p0, p1])
    assign_ids(s)
    rebuild_scopes_bottom_up(s)

    # Convert
    tf_spn, data_placeholder, variable_dict = spn_to_tf_graph(s, data=X)
    torch_spn = SumNode.from_spn(s)

    # Optimizer
    lr = 0.001
    tf_optim = tf.train.AdamOptimizer(lr)
    torch_optim = optim.Adam(torch_spn.parameters(), lr)

    t0 = time()
    epochs = 10
    optimize_tf_graph(tf_spn, variable_dict, data_placeholder, X,
                      epochs=epochs, optimizer=tf_optim)
    t1 = time()
    optimize_torch(torch_spn, X, epochs=epochs, optimizer=torch_optim)
    t2 = time()

    print("Tensorflow took: ", t1 - t0)
    print("PyTorch took: ", t2 - t1)
def complete_layers(layer_nodes, current_node_type=Sum, depth=None):
    # all leaves should be at same depth
    root_layer = False
    if depth is None:
        root_layer = True
        depth = get_depth(layer_nodes[0])

    if depth == 2:
        return

    children_layer = []
    if current_node_type == Sum:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Sum)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Product):
                    n.children[j] = Product([c])
            children_layer.extend(n.children)
        children_layer_type = Product
    elif current_node_type == Product:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Product)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Sum):
                    n.children[j] = Sum([1.0], [c])
            children_layer.extend(n.children)
        children_layer_type = Sum
    else:
        raise Exception('Invalid node type: ' + str(current_node_type))

    complete_layers(children_layer,
                    current_node_type=children_layer_type,
                    depth=depth - 1)

    if root_layer:
        rebuild_scopes_bottom_up(layer_nodes[0])
        assign_ids(layer_nodes[0])
def __init__(self):
    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])
    self.spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(self.spn)
    rebuild_scopes_bottom_up(self.spn)
def test_equal_to_tf(self):
    # SPFlow implementation
    g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
    g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
    g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
    g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
    p0 = Product(children=[g00, g10])
    p1 = Product(children=[g01, g11])
    s = Sum(weights=[0.2, 0.8], children=[p0, p1])
    assign_ids(s)
    rebuild_scopes_bottom_up(s)

    # Test for 100 random samples
    data = np.random.randn(100, 2)

    # LL from SPN
    ll = log_likelihood(s, data)

    # PyTorch implementation
    g00 = GaussianNode(mean=0.0, std=1.0, scope=0)
    g10 = GaussianNode(mean=1.0, std=2.0, scope=1)
    g01 = GaussianNode(mean=3.0, std=2.0, scope=0)
    g11 = GaussianNode(mean=5.0, std=1.0, scope=1)
    p0 = ProductNode(children=[g00, g10])
    p1 = ProductNode(children=[g01, g11])
    rootnode = SumNode(weights=[0.2, 0.8], children=[p0, p1])

    datatensor = torch.Tensor(data)
    # LL from pytorch
    ll_torch = rootnode(datatensor)

    # Assert equality
    self.assertTrue(
        np.isclose(np.array(ll).squeeze(),
                   ll_torch.detach().numpy(),
                   atol=DELTA).all())
def _serialize_model(self, model):
    msg = spflow_capnp.Model.new_message()
    valid, err = is_valid(model.root)
    assert valid, f"SPN invalid before serialization: {err}"

    # Assign (new) IDs to the nodes.
    # Keep track of already assigned IDs, so the IDs are
    # unique for the whole file.
    assign_ids(model.root, self.assignedIDs)

    # Rebuild scopes bottom-up
    rebuild_scopes_bottom_up(model.root)

    msg.rootNode = model.root.id
    msg.numFeatures = len(model.root.scope)
    msg.featureType = model.featureType

    scope = msg.init("scope", len(model.root.scope))
    for i, v in enumerate(model.root.scope):
        scope[i] = self._unwrap_value(v)

    name = ""
    if model.name is not None:
        name = model.name
    msg.name = name

    numNodes = get_number_of_nodes(model.root)
    nodes = msg.init("nodes", numNodes)
    nodeList = ListHandler(nodes)
    self._serialize_graph([model.root], nodeList)
    return msg
def getSpn2():
    spn2 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=0),
        Categorical(p=[0.2, 0.8], scope=2)
    ])
    assign_ids(spn2)
    rebuild_scopes_bottom_up(spn2)
    return spn2
    type_to_param_map=pm_continuous_param_map,
    scope=[0],
    init_weights=b_lf_1_init_weights)

b_lf_2_init_weights = {Gaussian: 0.3, Gamma: 0.7}
# b_lf_2_init_weights = np.array([.3, .7])
b_fat_right_leaf_2, _priors = type_mixture_leaf_factory(
    leaf_type='pm',
    leaf_meta_type=MetaType.REAL,
    type_to_param_map=pm_continuous_param_map,
    scope=[1],
    init_weights=b_lf_2_init_weights)

l_r_prod.children = [b_fat_right_leaf_1, b_fat_right_leaf_2]

#
# composing
rebuild_scopes_bottom_up(root)
assign_ids(root)
print(root)
print(spn_to_str_equation(root))

global_W = compute_global_type_weights(root)
print('GLOBAL_W', global_W)

global_W = compute_global_type_weights(root, aggr_type=True)
print('GLOBAL_W', global_W)

gw_map = compute_leaf_global_mix_weights(root)
print('G MIX W', gw_map)

part_map = compute_partition_id_map(root)
print('PARTITION MAP', part_map)
def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                           curr_information_set_scope, index):

    logging.info(
        f'start of new recursion in __learn_spmn_structure method of SPMN')
    logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
    logging.debug(
        f'curr_information_set_scope: {curr_information_set_scope}')

    # rest set is remaining variables excluding the variables in current information set
    rest_set_scope = [
        var_scope for var_scope in remaining_vars_scope
        if var_scope not in curr_information_set_scope
    ]
    logging.debug(f'rest_set_scope: {rest_set_scope}')

    scope_index = sum([len(x) for x in self.params.partial_order[:index]])
    next_scope_index = sum(
        [len(x) for x in self.params.partial_order[:index + 1]])

    if remaining_vars_scope == curr_information_set_scope:
        # this is last information set in partial order. Base case of recursion

        # test if current information set is a decision node
        if self.params.partial_order[index][0] in self.params.decision_nodes:
            raise Exception(
                f'last information set of partial order either contains random '
                f'and utility variables or just a utility variable. '
                f'This contains decision variable: {self.params.partial_order[index][0]}'
            )
        else:
            # contains just the random and utility variables
            logging.info(
                f'at last information set of this recursive call: {curr_information_set_scope}'
            )
            ds_context_last_information_set = get_ds_context(
                remaining_vars_data, remaining_vars_scope, self.params)

            if self.params.util_to_bin:
                last_information_set_spn = learn_parametric(
                    remaining_vars_data,
                    ds_context_last_information_set,
                    min_instances_slice=20,
                    initial_scope=remaining_vars_scope)
            else:
                last_information_set_spn = learn_mspn_for_spmn(
                    remaining_vars_data,
                    ds_context_last_information_set,
                    min_instances_slice=20,
                    initial_scope=remaining_vars_scope)

        logging.info(f'created spn at last information set')
        return last_information_set_spn

    # test for decision node. test if current information set is a decision node
    elif self.params.partial_order[index][0] in self.params.decision_nodes:

        decision_node = self.params.partial_order[index][0]
        logging.info(f'Encountered Decision Node: {decision_node}')

        # cluster the data from remaining variables w.r.t values of decision node
        clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(
            remaining_vars_data)

        decision_node_children_spns = []
        index += 1

        next_information_set_scope = np.array(
            range(next_scope_index, next_scope_index +
                  len(self.params.partial_order[index]))).tolist()

        next_remaining_vars_scope = rest_set_scope
        self.set_next_operation('Any')

        logging.info(f'split clusters based on decision node values')
        for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:
            decision_node_children_spns.append(
                self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                            next_remaining_vars_scope,
                                            next_information_set_scope,
                                            index))

        decision_node_spn_branch = Max(
            dec_idx=scope_index,
            dec_values=dec_vals,
            children=decision_node_children_spns,
            feature_name=decision_node)

        assign_ids(decision_node_spn_branch)
        rebuild_scopes_bottom_up(decision_node_spn_branch)
        logging.info(f'created decision node')
        return decision_node_spn_branch

    # testing for independence
    else:
        curr_op = self.get_curr_operation()
        logging.debug(
            f'curr_op at prod node (independence test): {curr_op}')

        if curr_op != 'Sum':
            # fails if correlated variable set found in previous recursive call.
            # Without this condition code keeps looping at this stage

            ds_context = get_ds_context(remaining_vars_data,
                                        remaining_vars_scope, self.params)

            split_cols = get_split_cols_RDC_py()
            data_slices_prod = split_cols(remaining_vars_data, ds_context,
                                          remaining_vars_scope)

            logging.debug(
                f'{len(data_slices_prod)} slices found at data_slices_prod: ')

            prod_children = []
            next_remaining_vars_scope = []
            independent_vars_scope = []

            for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:

                if any(var_scope in correlated_var_set_scope
                       for var_scope in rest_set_scope):
                    next_remaining_vars_scope.extend(correlated_var_set_scope)
                else:
                    # this variable set of current information set is
                    # not correlated to any variable in the rest set
                    logging.info(
                        f'independent variable set found: {correlated_var_set_scope}')

                    ds_context_prod = get_ds_context(
                        correlated_var_set_cluster, correlated_var_set_scope,
                        self.params)

                    if self.params.util_to_bin:
                        independent_var_set_prod_child = learn_parametric(
                            correlated_var_set_cluster,
                            ds_context_prod,
                            min_instances_slice=20,
                            initial_scope=correlated_var_set_scope)
                    else:
                        independent_var_set_prod_child = learn_mspn_for_spmn(
                            correlated_var_set_cluster,
                            ds_context_prod,
                            min_instances_slice=20,
                            initial_scope=correlated_var_set_scope)

                    independent_vars_scope.extend(correlated_var_set_scope)
                    prod_children.append(independent_var_set_prod_child)

            logging.info(
                f'correlated variables over entire remaining variables '
                f'at prod, passed for next recursion: '
                f'{next_remaining_vars_scope}')

            # check if all variables in current information set are consumed
            if all(var_scope in independent_vars_scope
                   for var_scope in curr_information_set_scope):
                index += 1
                next_information_set_scope = np.array(
                    range(next_scope_index, next_scope_index +
                          len(self.params.partial_order[index]))).tolist()

                # since current information set is totally consumed
                next_remaining_vars_scope = rest_set_scope

            else:
                # some variables in current information set still remain
                index = index
                next_information_set_scope = set(
                    curr_information_set_scope) - set(independent_vars_scope)
                next_remaining_vars_scope = next_information_set_scope | set(
                    rest_set_scope)

                # convert unordered sets of scope to sorted lists to keep in sync with partial order
                next_information_set_scope = sorted(
                    list(next_information_set_scope))
                next_remaining_vars_scope = sorted(
                    list(next_remaining_vars_scope))

            self.set_next_operation('Sum')

            next_remaining_vars_data = column_slice_data_by_scope(
                remaining_vars_data, remaining_vars_scope,
                next_remaining_vars_scope)

            logging.info(
                f'independence test completed for current information set {curr_information_set_scope} '
                f'and rest set {rest_set_scope} ')

            remaining_vars_prod_child = self.__learn_spmn_structure(
                next_remaining_vars_data, next_remaining_vars_scope,
                next_information_set_scope, index)

            prod_children.append(remaining_vars_prod_child)

            product_node = Product(children=prod_children)
            assign_ids(product_node)
            rebuild_scopes_bottom_up(product_node)

            logging.info(f'created product node')
            return product_node

        # Cluster the data
        else:
            curr_op = self.get_curr_operation()
            logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

            split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

            if self.cluster_by_curr_information_set:

                curr_information_set_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope,
                    curr_information_set_scope)

                ds_context_sum = get_ds_context(curr_information_set_data,
                                                curr_information_set_scope,
                                                self.params)
                data_slices_sum, km_model = split_rows(
                    curr_information_set_data, ds_context_sum,
                    curr_information_set_scope)

                logging.info(
                    f'split clusters based on current information set {curr_information_set_scope}')

            else:
                # cluster on whole remaining variables
                ds_context_sum = get_ds_context(remaining_vars_data,
                                                remaining_vars_scope,
                                                self.params)
                data_slices_sum, km_model = split_rows(remaining_vars_data,
                                                       ds_context_sum,
                                                       remaining_vars_scope)

                logging.info(
                    f'split clusters based on whole remaining variables {remaining_vars_scope}')

            sum_node_children = []
            weights = []
            index = index
            logging.debug(
                f'{len(data_slices_sum)} clusters found at data_slices_sum')

            cluster_num = 0
            labels_array = km_model.labels_
            logging.debug(
                f'cluster labels of rows: {labels_array} used to cluster data on '
                f'total remaining variables {remaining_vars_scope}')

            for cluster, scope, weight in data_slices_sum:

                self.set_next_operation("Prod")

                # cluster whole remaining variables based on clusters formed.
                # below methods are useful if clusters were formed on just the current information set
                cluster_indices = get_row_indices_of_cluster(
                    labels_array, cluster_num)
                cluster_on_remaining_vars = row_slice_data_by_indices(
                    remaining_vars_data, cluster_indices)

                # logging.debug(np.array_equal(cluster_on_remaining_vars, cluster))

                sum_node_children.append(
                    self.__learn_spmn_structure(cluster_on_remaining_vars,
                                                remaining_vars_scope,
                                                curr_information_set_scope,
                                                index))

                weights.append(weight)
                cluster_num += 1

            sum_node = Sum(weights=weights, children=sum_node_children)

            assign_ids(sum_node)
            rebuild_scopes_bottom_up(sum_node)
            logging.info(f'created sum node')
            return sum_node
#
# RANDOM STRUCTURE LEARNING
learn_start_t = perf_counter()
spn = learn_rand_spn(data,
                     ds_context,
                     min_instances_slice=args.min_instances,
                     row_a=args.beta_rows[0],
                     row_b=args.beta_rows[1],
                     col_a=args.beta_cols[0],
                     col_b=args.beta_cols[1],
                     col_threshold=args.col_split_threshold,
                     memory=None,
                     rand_gen=rand_gen)
rebuild_scopes_bottom_up(spn)
assign_ids(spn)
learn_end_t = perf_counter()

stats = get_structure_stats_dict(spn)
logging.info('\n\nLearned spn in {} with stats:\n\t{}'.format(
    learn_end_t - learn_start_t, stats))

print(spn_to_str_equation(spn))
print(spn.scope)

#
# storing the spn on file
spn_output_path = os.path.join(out_path, 'spn.model.pkl')
store_start_t = perf_counter()
with open(spn_output_path, 'wb') as f:
def learn_spmn_structure(train_data, index, scope_index, params):

    train_data = train_data
    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)
        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:
            if index < len(params.partial_order):
                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0,
                          feature_name=decision_node)
            else:
                spn = Max(dec_values=dec_vals, children=None,
                          feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:
        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(
            train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index,
                                    params.feature_names)
        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod,
                                              index, scope_index, params)

        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod,
                                      scope_prod)

        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):

            set_next_operation("Sum")

            if params.util_to_bin:
                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20,
                                        initial_scope=scope_prod)
            else:
                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod,
                                  min_instances_slice=20,
                                  initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):
                spn1 = learn_spmn_structure(curr_train_data, index,
                                            scope_index, params)
                spn = Product(children=[spn0, spn1])
            else:
                spn = spn0

            assign_ids(spn)
            rebuild_scopes_bottom_up(spn)

        else:
            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))
            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index,
                                                scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []
            index = index

            if index < len(params.partial_order):
                for cl, scop, weight in data_slices_sum:
                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

            spn = Sum(weights=weights, children=spn0)
            assign_ids(spn)
            rebuild_scopes_bottom_up(spn)

        return spn
def build_xpc_bottom_up(data, part_root, dtree_dict, det_level, leaves, alpha):
    """
    Build the XPC induced by the partitions tree in a bottom-up way.
    The building process is a post-order traversal of the partitions tree.

    :param alpha: smoothing factor
    :param det_level: 0 for non det., 1 for relaxed det. and 2 for det.
    :param dtree_dict: None if no dependency tree has to be respected, a dictionary of the dtree otherwise
    :param part_root: a random partitions tree
    :param data: the data to model
    :param leaves: multivariate leaf function
    :return: the XPC induced by the partitions tree
    """
    partitions_stack = [part_root]
    pc_nodes_stack = []
    last_part_exp = None

    while partitions_stack:
        part = partitions_stack[-1]
        if not part.is_partitioned() or (last_part_exp in part.get_sub_partitions()):
            if part.is_partitioned():
                pc_child_nodes = pc_nodes_stack[-len(part.get_sub_partitions()):]
                pc_nodes_stack = pc_nodes_stack[:len(pc_nodes_stack) -
                                                len(part.get_sub_partitions())]

                if part.is_horizontally_partitioned:
                    weights = [
                        len(sub_part.row_ids) / len(part.row_ids)
                        for sub_part in part.get_sub_partitions()
                    ]
                    pc_sum_node = Sum(weights=weights, children=pc_child_nodes)
                    pc_nodes_stack.append(pc_sum_node)
                else:
                    # merge nested products and collapse single-child sums
                    # before creating the new product node
                    pc_child_nodes_ = []
                    for c in pc_child_nodes:
                        if isinstance(c, Product) or (isinstance(c, Sum)
                                                      and len(c.children) == 1):
                            pc_child_nodes_.extend(c.children)
                        else:
                            pc_child_nodes_.append(c)
                    pc_prod_node = Product(children=pc_child_nodes_)
                    pc_nodes_stack.append(pc_prod_node)
            else:
                leaf_pc = create_leaf_pc(data, part, leaves, dtree_dict,
                                         det_level, alpha)
                pc_nodes_stack.append(leaf_pc)
            last_part_exp = partitions_stack.pop()
        else:
            partitions_stack.extend(part.get_sub_partitions()[::-1])

    xpc = pc_nodes_stack[0]
    assign_ids(xpc)
    rebuild_scopes_bottom_up(xpc)
    return xpc
def create_expc(data, ensemble_dim, sd_level, det_level, min_part_inst,
                conj_len, arity, leaves, alpha=0.01, bagging=False,
                max_parts=1000, random_seed=42):

    if sd_level not in SD_LEVELS:
        raise StructDecError()
    if det_level not in DET_LEVELS:
        raise DetError()
    if arity < 2 or arity > 2**conj_len:
        raise ArityError()
    if sd_level == SD_LEVEL_2 and conj_len == 1:
        raise NoRandomness()

    print('Generating random partitionings..')
    np.random.seed(random_seed)

    str_dec = (sd_level == SD_LEVEL_1 or sd_level == SD_LEVEL_2)

    if sd_level == SD_LEVEL_2:
        uncond_vars = greedy_vars_ordering(data, conj_len)
    else:
        uncond_vars = np.arange(data.shape[1]).tolist()

    conj_vars_l = [None] * ensemble_dim
    cl_parts_l = [None] * ensemble_dim
    n_parts_l = [None] * ensemble_dim
    ptrees_l = [None] * ensemble_dim
    data_l = [None] * ensemble_dim
    xpc_l = [None] * ensemble_dim

    for i in range(ensemble_dim):

        if sd_level != SD_LEVEL_2:
            np.random.shuffle(uncond_vars)

        if bagging:
            data_l[i] = data[np.random.choice(a=data.shape[0],
                                              size=data.shape[0] * 70 // 100,
                                              replace=True)]
        else:
            data_l[i] = data

        print(uncond_vars)
        ptrees_l[i], cl_parts_l[i], conj_vars_l[i], n_parts_l[i] = \
            create_random_partitioning(data=data_l[i],
                                       str_dec=str_dec,
                                       min_part_inst=min_part_inst,
                                       conj_len=conj_len,
                                       arity=arity,
                                       max_parts=max_parts,
                                       uncond_vars=uncond_vars)

    if all([n_parts == 1 for n_parts in n_parts_l]):
        raise NoPartitioningFound()

    if sd_level == SD_LEVEL_0 or leaves == create_naive_fact:
        dtree_dict = None
        for i in range(ensemble_dim):
            print('Learning XPC_%s/%s' % (i, ensemble_dim))
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)
    elif sd_level == SD_LEVEL_1:
        for i in range(ensemble_dim):
            print('Learning XPC_%s/%s' % (i, ensemble_dim))
            #
            # learn a dtree for each XPC
            dtree_dict = create_dtree_dict([data_l[i]], [cl_parts_l[i]],
                                           conj_vars_l[i], alpha)
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)
    elif sd_level == SD_LEVEL_2:
        #
        # learn a dtree for the ensemble
        print('Learning a dependency tree for the ensemble..')
        dtree_dict = create_dtree_dict(data_l, cl_parts_l,
                                       max(conj_vars_l, key=len), alpha)
        for i in range(ensemble_dim):
            print('Building XPC_%s/%s' % (i, ensemble_dim))
            xpc_l[i] = build_xpc_bottom_up(data_l[i], ptrees_l[i], dtree_dict,
                                           det_level, leaves, alpha)

    expc = Sum(weights=1 / ensemble_dim * np.ones(ensemble_dim), children=xpc_l)
    assign_ids(expc)
    rebuild_scopes_bottom_up(expc)

    return expc, n_parts_l
def str_to_spn(text, features=None, str_to_spn_lambdas=_str_to_spn):
    from lark import Lark

    ext_name = "\n".join(map(lambda s: " | " + s, str_to_spn_lambdas.keys()))
    ext_grammar = "\n".join([s for _, s, _ in str_to_spn_lambdas.values()])

    grammar = r"""
%import common.DECIMAL -> DECIMAL
%import common.WS
%ignore WS
%import common.WORD -> WORD
%import common.DIGIT -> DIGIT
ALPHANUM: "a".."z"|"A".."Z"|DIGIT
PARAMCHARS: ALPHANUM|"_"
FNAME: ALPHANUM+
PARAMNAME: PARAMCHARS+
NUMBER: DIGIT|DECIMAL
NUMBERS: NUMBER+
list: "[" [NUMBERS ("," NUMBERS)*] "]"

?node: prodnode
    | sumnode
""" + ext_name + r"""

prodnode: "(" [node ("*" node)*] ")"
sumnode: "(" [NUMBERS "*" node ("+" NUMBERS "*" node)*] ")"
""" + ext_grammar

    parser = Lark(grammar, start='node')
    # print(grammar)
    tree = parser.parse(text)

    def tree_to_spn(tree, features):
        tnode = tree.data

        if tnode == "sumnode":
            node = Sum()
            for i in range(int(len(tree.children) / 2)):
                j = 2 * i
                w, c = tree.children[j], tree.children[j + 1]
                node.weights.append(float(w))
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode == "prodnode":
            if len(tree.children) == 1:
                return tree_to_spn(tree.children[0], features)
            node = Product()
            for c in tree.children:
                node.children.append(tree_to_spn(c, features))
            return node

        if tnode in str_to_spn_lambdas:
            return str_to_spn_lambdas[tnode][0](tree, features,
                                                str_to_spn_lambdas[tnode][2],
                                                tree_to_spn)

        raise Exception('Node type not registered: ' + tnode)

    spn = tree_to_spn(tree, features)

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    valid, err = is_valid(spn)
    assert valid, err

    assign_ids(spn)
    return spn