def get_splitting_functions(cols, rows, ohe, threshold, rand_gen, n_jobs):
    """Resolve column/row splitting strategies into splitting callables.

    ``cols`` and ``rows`` may each be a strategy name (str) or an
    already-constructed splitting callable, which is passed through
    unchanged.  Unknown strategy names raise ``AssertionError``.
    """
    from spn.algorithms.splitting.Clustering import (
        get_split_rows_KMeans,
        get_split_rows_TSNE,
        get_split_rows_GMM,
    )
    from spn.algorithms.splitting.PoissonStabilityTest import get_split_cols_poisson_py
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py

    # Column (independence-test) strategy.
    if not isinstance(cols, str):
        split_cols = cols
    elif cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
    elif cols == "poisson":
        split_cols = get_split_cols_poisson_py(threshold, n_jobs=n_jobs)
    else:
        raise AssertionError("unknown columns splitting strategy type %s" % str(cols))

    # Row (clustering) strategy.
    if not isinstance(rows, str):
        split_rows = rows
    elif rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
    elif rows == "kmeans":
        split_rows = get_split_rows_KMeans()
    elif rows == "tsne":
        split_rows = get_split_rows_TSNE()
    elif rows == "gmm":
        split_rows = get_split_rows_GMM()
    else:
        raise AssertionError("unknown rows splitting strategy type %s" % str(rows))

    return split_cols, split_rows
def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe, rand_gen=None):
    """Learn an SPN structure from ``data`` using the given splitting strategies.

    Parameters
    ----------
    data : np.ndarray
        Training data, one row per instance.
    ds_context : Context
        Dataset context (meta types / domains).
    cols : str
        Column-splitting strategy; only "rdc" is supported here.
    rows : str
        Row-splitting strategy; only "kmeans" is supported here.
    min_instances_slice : int
        Minimum number of instances before a slice is turned into leaves.
    threshold : float
        RDC independence threshold for column splits.
    linear, ohe : unused
        Kept for interface compatibility; the RDC calls below hard-code
        ``ohe=True`` as in the sibling ``learn`` implementation.
    rand_gen : np.random.RandomState, optional
        Source of randomness; defaults to ``RandomState(17)`` (same seed
        as the sibling ``learn`` function) when omitted.

    Raises
    ------
    ValueError
        If ``cols`` or ``rows`` names an unsupported strategy (previously
        this surfaced as a NameError from an unbound local).
    """
    if rand_gen is None:
        rand_gen = np.random.RandomState(17)

    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(
            threshold, ohe=True, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen
        )
    else:
        raise ValueError("unknown columns splitting strategy type %s" % str(cols))

    if rows == "kmeans":
        # NOTE(review): the "kmeans" strategy is implemented via the RDC row
        # splitter with 2 clusters, mirroring the sibling learn() — confirm
        # this is intended rather than get_split_rows_KMeans().
        split_rows = get_split_rows_RDC_py(
            n_clusters=2, ohe=True, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen
        )
    else:
        raise ValueError("unknown rows splitting strategy type %s" % str(rows))

    nextop = get_next_operation(min_instances_slice)

    # `leaves` is expected to be defined at module level — TODO confirm.
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def learn_param(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
    """Learn a parametric SPN structure from ``data``.

    Parameters
    ----------
    data : np.ndarray
        Training data, one row per instance.
    ds_context : Context
        Dataset context (meta types / domains).
    cols : str
        Column-splitting strategy; only "rdc" is supported.
    rows : str
        Row-splitting strategy; "rdc" or "kmeans".
    min_instances_slice : int
        Minimum number of instances before a slice becomes leaves.
    threshold : float
        RDC independence threshold.
    ohe : bool
        One-hot-encode categorical features for the RDC splitters.

    Raises
    ------
    ValueError
        If ``cols`` or ``rows`` names an unsupported strategy (previously
        this surfaced as an UnboundLocalError further down).

    Notes
    -----
    ``rand_gen``, ``cpus`` and ``leaves`` are assumed to be module-level
    names — TODO confirm against the full file.
    """
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    else:
        raise ValueError("unknown columns splitting strategy type %s" % str(cols))

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    elif rows == "kmeans":
        split_rows = get_split_rows_KMeans()
    else:
        raise ValueError("unknown rows splitting strategy type %s" % str(rows))

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe, leaves=None):
    """Learn a mixed SPN structure that tolerates missing values.

    Parameters
    ----------
    data : np.ndarray
        Training data, one row per instance (may contain missing values).
    ds_context : Context
        Dataset context (meta types / domains).
    cols : str
        Column-splitting strategy; only "rdc" is supported.
    rows : str
        Row-splitting strategy; "rdc" or "kmeans".
    min_instances_slice : int
        Minimum number of instances before a slice becomes leaves.
    threshold : float
        RDC independence threshold.
    linear : unused
        Kept for interface compatibility.
    ohe : bool
        One-hot-encode categorical features for the RDC splitters.
    leaves : callable, optional
        Leaf-creation function; defaults to ``create_histogram_leaf``.
        (Previously ``leaves`` was read before any assignment, which
        raised UnboundLocalError — it is now a proper parameter.)

    Raises
    ------
    ValueError
        If ``cols`` or ``rows`` names an unsupported strategy.

    Notes
    -----
    ``rand_gen`` and ``cpus`` are assumed to be module-level names —
    TODO confirm against the full file.
    """
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    else:
        raise ValueError("unknown columns splitting strategy type %s" % str(cols))

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    elif rows == "kmeans":
        split_rows = get_split_rows_KMeans()
    else:
        raise ValueError("unknown rows splitting strategy type %s" % str(rows))

    if leaves is None:
        leaves = create_histogram_leaf

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):
    """Learn an SPN structure with RDC column and row splitting.

    Uses the Python RDC implementation for both independence testing
    (columns) and clustering (rows, 2 clusters), with typed leaves.
    ``linear`` and ``ohe`` are currently unused; the RDC calls hard-code
    ``ohe=True``.  A ``RandomState(17)`` is created when ``rand_gen`` is
    omitted, and the generator is stored on ``ds_context``.
    """
    rng = np.random.RandomState(17) if rand_gen is None else rand_gen
    ds_context.rand_gen = rng

    # FIXME: adopt the python version of RDC, allowing to deal with missing values
    rdc_kwargs = dict(ohe=True, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=1, rand_gen=rng)
    split_cols = get_split_cols_RDC_py(threshold, **rdc_kwargs)
    split_rows = get_split_rows_RDC_py(n_clusters=2, **rdc_kwargs)

    leaves = create_type_leaf
    nextop = get_next_operation(min_instances_slice)

    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    """Build an SPN from raw data for approximate query processing (AQP).

    Parameters
    ----------
    numpy_data : array-like
        Training data, one row per instance; cast to ``np.float64``.
    feature_types : list[str]
        Per-column type, each "discrete" or "continuous".
    spn_params : dict
        Hyper-parameters: ``rdc_threshold``, ``cols``, ``rows``,
        ``min_instances_slice``, ``ohe``, ``prior_weight``,
        ``identity_numeric``.
    rand_gen : np.random.RandomState
        Source of randomness for the RDC splitters.

    Returns
    -------
    The root node of the learned SPN.

    Raises
    ------
    Exception
        If a feature type is unknown, or a leaf cannot be created for a
        meta type.
    """
    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py
    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf

    # Cast may not be necessary
    numpy_data = np.array(numpy_data, np.float64)

    # Generate meta_type array
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)

    # Create information about the domains: [min, max] for continuous
    # features, the set of observed values for discrete ones.
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        if feature_type == 'continuous':
            domains.append([np.min(numpy_data[:, col]), np.max(numpy_data[:, col])])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(numpy_data[:, col]))

    # Create context
    ds_context = Context(meta_types=meta_types, domains=domains)

    # Fixed parameters
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]

    # Method to create leaves in the SPN
    def create_leaf(data, ds_context, scope):
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]

        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)
            # prior_weight == 0. means "no regularization" for piecewise leaves.
            if prior_weight == 0.:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=None)
            else:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=prior_weight)

        elif meta_type == MetaType.DISCRETE:
            # Empirical category frequencies, indexed by the (integer)
            # category value so that unseen categories get count 0.
            unique, counts = np.unique(data[:, 0], return_counts=True)
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for i, x in enumerate(unique):
                sorted_counts[int(x)] = counts[i]
            p = sorted_counts / data.shape[0]

            # Do regularization (additive smoothing with prior_weight)
            if prior_weight > 0.:
                p += prior_weight
                p = p / np.sum(p)
            return Categorical(p, scope)

        else:
            # Fixed typo in the error message ("Mehtod" -> "Method").
            raise Exception("Method learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))

    # Set method to create leaves
    leaves = create_leaf

    # Set methods to cluster and to do the independence test
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)
    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)

    # This choses which operation is performed
    nextop = get_next_operation(min_instances_slice)

    # Learn the SPN
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)

    return root_node