def get_splitting_functions(cols, rows, ohe, threshold, rand_gen, n_jobs):
    from spn.algorithms.splitting.Clustering import get_split_rows_KMeans, get_split_rows_TSNE, get_split_rows_GMM
    from spn.algorithms.splitting.PoissonStabilityTest import get_split_cols_poisson_py
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py

    if isinstance(cols, str):
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif cols == "poisson":
            split_cols = get_split_cols_poisson_py(threshold, n_jobs=n_jobs)
        else:
            raise AssertionError("unknown columns splitting strategy type %s" % str(cols))
    else:
        split_cols = cols

    if isinstance(rows, str):
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()
        elif rows == "tsne":
            split_rows = get_split_rows_TSNE()
        elif rows == "gmm":
            split_rows = get_split_rows_GMM()
        else:
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))
    else:
        split_rows = rows

    return split_cols, split_rows
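# A minimal usage sketch for the dispatcher above; the strategy names and
# parameter values are illustrative. Either argument may also be a callable,
# which is passed through unchanged.
#
# import numpy as np
# split_cols, split_rows = get_splitting_functions(
#     cols="rdc", rows="kmeans", ohe=False, threshold=0.3,
#     rand_gen=np.random.RandomState(17), n_jobs=1)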
def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
    # rand_gen and leaves are closure variables supplied by the enclosing
    # learn_* wrapper; split_rows stays unbound for row strategies other
    # than "kmeans"
    split_cols = None
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)

    if rows == "kmeans":
        # note: "kmeans" here selects RDC-based row splitting, which clusters
        # the RDC projections with KMeans
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def learn_param(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
    # rand_gen, cpus and leaves are closure variables supplied by the
    # enclosing learn_parametric-style wrapper
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    elif rows == "kmeans":
        split_rows = get_split_rows_KMeans()

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe, leaves=None):
    # rand_gen and cpus are closure variables supplied by the enclosing
    # learn_mspn-style wrapper; leaves is a parameter here, since reading and
    # then assigning a closure variable would raise UnboundLocalError
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    elif rows == "kmeans":
        split_rows = get_split_rows_KMeans()

    if leaves is None:
        leaves = create_histogram_leaf

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
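# A sketch of the wrapper pattern the inner functions above rely on, modeled
# on SPFlow's learn_mspn; the wrapper name is illustrative. The wrapper
# defines rand_gen, cpus and leaves, and nests the inner function so those
# names resolve via closure.
def learn_mspn_sketch(data, ds_context, cols="rdc", rows="kmeans",
                      min_instances_slice=200, threshold=0.3, ohe=False,
                      leaves=None, rand_gen=None, cpus=-1):
    if rand_gen is None:
        rand_gen = np.random.RandomState(17)

    def l_mspn(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
        # rand_gen, cpus and leaves resolve to the wrapper's locals, which is
        # how the l_mspn_* inner functions above obtain them
        split_cols, split_rows = get_splitting_functions(cols, rows, ohe, threshold, rand_gen, cpus)
        nextop = get_next_operation(min_instances_slice)
        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves or create_histogram_leaf, nextop)

    return l_mspn(data, ds_context, cols, rows, min_instances_slice, threshold, ohe)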
def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):
    if rand_gen is None:
        rand_gen = np.random.RandomState(17)

    ds_context.rand_gen = rand_gen

    # FIXME: adopt the python version of RDC, allowing to deal with missing values
    # split_cols = get_split_cols_RDC(threshold, ohe, linear)
    split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                       non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)
    split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                       non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)
    # get_split_rows_RDC(n_clusters=2, k=10, s=1 / 6, ohe=True, seed=rand_gen)

    leaves = create_type_leaf
    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
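# A minimal usage sketch for learn above, on synthetic continuous data. The
# Context and MetaType imports follow SPFlow's public API; the data and
# parameter values are made up, and we assume create_type_leaf can handle
# REAL meta-types in this context.
#
# import numpy as np
# from spn.structure.Base import Context
# from spn.structure.StatisticalTypes import MetaType
#
# data = np.random.RandomState(0).normal(size=(200, 3))
# ds_context = Context(meta_types=[MetaType.REAL] * 3).add_domains(data)
# spn = learn(data, ds_context, min_instances_slice=50, threshold=0.3,
#             linear=False, ohe=False)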
def learn_spmn_structure(train_data, index, scope_index, params):

    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:
        # current information set is a decision node: split the data on its
        # values and learn one child per decision value
        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)

        spn0 = []
        index = index + 1
        set_next_operation("None")

        if index < len(params.partial_order):
            for c in cl:
                spn0.append(learn_spmn_structure(c, index, scope_index, params))
            spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)
        else:
            spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:
        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)
        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)
        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)

        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):
            set_next_operation("Sum")

            if params.util_to_bin:
                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20, initial_scope=scope_prod)
            else:
                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod,
                                  min_instances_slice=20, initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):
                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])
            else:
                spn = spn0

        else:
            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))
            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []

            if index < len(params.partial_order):
                for cl, scop, weight in data_slices_sum:
                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

            spn = Sum(weights=weights, children=spn0)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn
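# A hypothetical params object for learn_spmn_structure above. The class name
# and constructor are illustrative; only the attribute names (partial_order,
# decision_nodes, feature_names, util_to_bin) are taken from the code.
class SPMNParamsSketch:
    def __init__(self, partial_order, decision_nodes, feature_names, util_to_bin):
        self.partial_order = partial_order    # ordered list of information sets
        self.decision_nodes = decision_nodes  # variables that trigger the Max branch
        self.feature_names = feature_names
        self.util_to_bin = util_to_bin        # True -> learn_parametric leaves

# params = SPMNParamsSketch(
#     partial_order=[['T'], ['D'], ['O', 'U']],  # 'D' is a decision variable
#     decision_nodes=['D'],
#     feature_names=['T', 'D', 'O', 'U'],
#     util_to_bin=False,
# )
# spn = learn_spmn_structure(train_data, index=0, scope_index=0, params=params)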
    # if compress:
    #     node = Compress(node)
    # if prune:
    #     node = Prune(node)
    #
    # if validate:
    #     valid, err = is_valid(node)
    #     assert valid, "invalid spn: " + err

    return node


if __name__ == "__main__":
    train_data = np.c_[
        np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))],
        np.r_[np.zeros((500, 1)), np.ones((500, 1))],
    ]

    spn = learn_structure(
        train_data,
        Context(parametric_types=[Gaussian, Gaussian, Categorical]).add_domains(train_data),
        scope=list(range(train_data.shape[1])),
        split_rows=get_split_rows_KMeans(),
        split_cols=get_split_cols_RDC_py(),
        create_leaf=create_parametric_leaf,
    )

    from spn.io.plot import TreeVisualization

    TreeVisualization.plot_spn(spn, file_name="tree_spn2.png")
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py
    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf

    # cast may not be necessary
    numpy_data = np.array(numpy_data, np.float64)

    # generate meta_type array
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)

    # create information about the domains
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        if feature_type == 'continuous':
            domains.append([np.min(numpy_data[:, col]), np.max(numpy_data[:, col])])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(numpy_data[:, col]))

    # create context
    ds_context = Context(meta_types=meta_types, domains=domains)

    # fixed parameters
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]

    # method to create leaves in the SPN
    def create_leaf(data, ds_context, scope):
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]

        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)
            if prior_weight == 0.:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=None)
            else:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=prior_weight)

        elif meta_type == MetaType.DISCRETE:
            unique, counts = np.unique(data[:, 0], return_counts=True)
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for i, x in enumerate(unique):
                sorted_counts[int(x)] = counts[i]
            p = sorted_counts / data.shape[0]

            # do regularization
            if prior_weight > 0.:
                p += prior_weight
            p = p / np.sum(p)
            return Categorical(p, scope)

        else:
            raise Exception("Method learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))

    # set method to create leaves
    leaves = create_leaf

    # set methods to cluster and to do the independence test
    if cols == "rdc":
        # split_cols = get_split_cols_RDC(rdc_threshold, ohe=ohe, linear=True)
        split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)
    if rows == "rdc":
        # split_rows = get_split_rows_RDC(ohe=ohe)
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)

    # this chooses which operation is performed next
    nextop = get_next_operation(min_instances_slice)

    # learn the SPN
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)
    return root_node
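# A usage sketch for build_spn; the data is synthetic and the parameter
# values are illustrative, but the dictionary keys are exactly the ones the
# function above reads from spn_params.
#
# demo_data = np.c_[np.random.normal(0, 1, (100, 2)),
#                   np.random.randint(0, 3, (100, 1))]
# spn_params = {
#     "rdc_threshold": 0.3,
#     "cols": "rdc",
#     "rows": "rdc",
#     "min_instances_slice": 50,
#     "ohe": False,
#     "prior_weight": 0.01,
#     "identity_numeric": False,
# }
# root = build_spn(demo_data, ["continuous", "continuous", "discrete"],
#                  spn_params, np.random.RandomState(17))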
def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                           curr_information_set_scope, index):

    logging.info('start of new recursion in __learn_spmn_structure method of SPMN')
    logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
    logging.debug(f'curr_information_set_scope: {curr_information_set_scope}')

    # rest set is remaining variables excluding the variables in the current information set
    rest_set_scope = [var_scope for var_scope in remaining_vars_scope
                      if var_scope not in curr_information_set_scope]
    logging.debug(f'rest_set_scope: {rest_set_scope}')

    scope_index = sum([len(x) for x in self.params.partial_order[:index]])
    next_scope_index = sum([len(x) for x in self.params.partial_order[:index + 1]])

    if remaining_vars_scope == curr_information_set_scope:
        # this is the last information set in the partial order. Base case of the recursion.

        # test if the current information set is a decision node
        if self.params.partial_order[index][0] in self.params.decision_nodes:
            raise Exception(
                f'last information set of partial order either contains random '
                f'and utility variables or just a utility variable. '
                f'This contains decision variable: {self.params.partial_order[index][0]}')
        else:
            # contains just the random and utility variables
            logging.info(f'at last information set of this recursive call: {curr_information_set_scope}')
            ds_context_last_information_set = get_ds_context(
                remaining_vars_data, remaining_vars_scope, self.params)

            if self.params.util_to_bin:
                last_information_set_spn = learn_parametric(
                    remaining_vars_data, ds_context_last_information_set,
                    min_instances_slice=20, initial_scope=remaining_vars_scope)
            else:
                last_information_set_spn = learn_mspn_for_spmn(
                    remaining_vars_data, ds_context_last_information_set,
                    min_instances_slice=20, initial_scope=remaining_vars_scope)

        logging.info('created spn at last information set')
        return last_information_set_spn

    # test for decision node: is the current information set a decision node?
    elif self.params.partial_order[index][0] in self.params.decision_nodes:
        decision_node = self.params.partial_order[index][0]
        logging.info(f'Encountered Decision Node: {decision_node}')

        # cluster the data from remaining variables w.r.t. values of the decision node
        clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(remaining_vars_data)

        decision_node_children_spns = []
        index += 1

        next_information_set_scope = np.array(
            range(next_scope_index,
                  next_scope_index + len(self.params.partial_order[index]))).tolist()
        next_remaining_vars_scope = rest_set_scope
        self.set_next_operation('Any')

        logging.info('split clusters based on decision node values')
        for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:
            decision_node_children_spns.append(
                self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                            next_remaining_vars_scope,
                                            next_information_set_scope, index))

        decision_node_spn_branch = Max(dec_idx=scope_index, dec_values=dec_vals,
                                       children=decision_node_children_spns,
                                       feature_name=decision_node)

        assign_ids(decision_node_spn_branch)
        rebuild_scopes_bottom_up(decision_node_spn_branch)
        logging.info('created decision node')
        return decision_node_spn_branch

    # testing for independence
    else:
        curr_op = self.get_curr_operation()
        logging.debug(f'curr_op at prod node (independence test): {curr_op}')

        if curr_op != 'Sum':
            # the independence test fails if a correlated variable set was
            # found in the previous recursive call; without this condition the
            # code keeps looping at this stage

            ds_context = get_ds_context(remaining_vars_data, remaining_vars_scope, self.params)
            split_cols = get_split_cols_RDC_py()
            data_slices_prod = split_cols(remaining_vars_data, ds_context, remaining_vars_scope)

            logging.debug(f'{len(data_slices_prod)} slices found at data_slices_prod')

            prod_children = []
            next_remaining_vars_scope = []
            independent_vars_scope = []

            for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:
                if any(var_scope in correlated_var_set_scope for var_scope in rest_set_scope):
                    next_remaining_vars_scope.extend(correlated_var_set_scope)
                else:
                    # this variable set of the current information set is
                    # not correlated to any variable in the rest set
                    logging.info(f'independent variable set found: {correlated_var_set_scope}')

                    ds_context_prod = get_ds_context(correlated_var_set_cluster,
                                                     correlated_var_set_scope, self.params)

                    if self.params.util_to_bin:
                        independent_var_set_prod_child = learn_parametric(
                            correlated_var_set_cluster, ds_context_prod,
                            min_instances_slice=20, initial_scope=correlated_var_set_scope)
                    else:
                        independent_var_set_prod_child = learn_mspn_for_spmn(
                            correlated_var_set_cluster, ds_context_prod,
                            min_instances_slice=20, initial_scope=correlated_var_set_scope)

                    independent_vars_scope.extend(correlated_var_set_scope)
                    prod_children.append(independent_var_set_prod_child)

            logging.info(f'correlated variables over entire remaining variables '
                         f'at prod, passed for next recursion: {next_remaining_vars_scope}')

            # check if all variables in the current information set are consumed
            if all(var_scope in independent_vars_scope for var_scope in curr_information_set_scope):
                index += 1
                next_information_set_scope = np.array(
                    range(next_scope_index,
                          next_scope_index + len(self.params.partial_order[index]))).tolist()

                # since the current information set is totally consumed
                next_remaining_vars_scope = rest_set_scope
            else:
                # some variables in the current information set still remain; index stays the same
                next_information_set_scope = set(curr_information_set_scope) - set(independent_vars_scope)
                next_remaining_vars_scope = next_information_set_scope | set(rest_set_scope)

                # convert unordered sets of scope to sorted lists to keep in sync with the partial order
                next_information_set_scope = sorted(list(next_information_set_scope))
                next_remaining_vars_scope = sorted(list(next_remaining_vars_scope))

            self.set_next_operation('Sum')

            next_remaining_vars_data = column_slice_data_by_scope(
                remaining_vars_data, remaining_vars_scope, next_remaining_vars_scope)

            logging.info(f'independence test completed for current information set '
                         f'{curr_information_set_scope} and rest set {rest_set_scope}')

            remaining_vars_prod_child = self.__learn_spmn_structure(
                next_remaining_vars_data, next_remaining_vars_scope,
                next_information_set_scope, index)
            prod_children.append(remaining_vars_prod_child)

            product_node = Product(children=prod_children)
            assign_ids(product_node)
            rebuild_scopes_bottom_up(product_node)

            logging.info('created product node')
            return product_node

        # cluster the data
        else:
            logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

            split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

            if self.cluster_by_curr_information_set:
                curr_information_set_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope, curr_information_set_scope)

                ds_context_sum = get_ds_context(curr_information_set_data,
                                                curr_information_set_scope, self.params)
                data_slices_sum, km_model = split_rows(curr_information_set_data,
                                                       ds_context_sum,
                                                       curr_information_set_scope)

                logging.info(f'split clusters based on current information set '
                             f'{curr_information_set_scope}')
            else:
                # cluster on whole remaining variables
                ds_context_sum = get_ds_context(remaining_vars_data,
                                                remaining_vars_scope, self.params)
                data_slices_sum, km_model = split_rows(remaining_vars_data,
                                                       ds_context_sum,
                                                       remaining_vars_scope)

                logging.info(f'split clusters based on whole remaining variables '
                             f'{remaining_vars_scope}')

            sum_node_children = []
            weights = []
            logging.debug(f'{len(data_slices_sum)} clusters found at data_slices_sum')

            cluster_num = 0
            labels_array = km_model.labels_
            logging.debug(f'cluster labels of rows: {labels_array} used to cluster data on '
                          f'total remaining variables {remaining_vars_scope}')

            for cluster, scope, weight in data_slices_sum:
                self.set_next_operation("Prod")

                # cluster the whole remaining variables based on the clusters
                # formed; the methods below are useful when clusters were
                # formed on just the current information set
                cluster_indices = get_row_indices_of_cluster(labels_array, cluster_num)
                cluster_on_remaining_vars = row_slice_data_by_indices(remaining_vars_data,
                                                                      cluster_indices)

                sum_node_children.append(
                    self.__learn_spmn_structure(cluster_on_remaining_vars,
                                                remaining_vars_scope,
                                                curr_information_set_scope, index))

                weights.append(weight)
                cluster_num += 1

            sum_node = Sum(weights=weights, children=sum_node_children)
            assign_ids(sum_node)
            rebuild_scopes_bottom_up(sum_node)

            logging.info('created sum node')
            return sum_node
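# Minimal sketches of the two SPMNHelper utilities used in the sum branch
# above. These mirror the semantics the code relies on (labels_array is the
# fitted KMeans labels_ vector), not necessarily the library's exact
# implementations.
#
# import numpy as np
#
# def get_row_indices_of_cluster(labels_array, cluster_num):
#     # rows whose KMeans label equals the requested cluster id
#     return np.where(labels_array == cluster_num)[0]
#
# def row_slice_data_by_indices(data, indices):
#     # keep only the rows that belong to the cluster
#     return data[indices]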