示例#1
0
def get_splitting_functions(cols, rows, ohe, threshold, rand_gen, n_jobs):
    """Resolve column/row splitting strategies into splitting functions.

    ``cols`` and ``rows`` are each either a strategy name (``str``), which is
    turned into a splitting function by the matching factory, or any
    non-string object, which is returned unchanged.

    The factory modules are imported only for the strategy that is actually
    selected, so the modules behind unselected strategies (and whatever they
    import in turn) are never loaded.

    Args:
        cols: ``"rdc"``, ``"poisson"``, or a non-string object used as-is.
        rows: ``"rdc"``, ``"kmeans"``, ``"tsne"``, ``"gmm"``, or a non-string
            object used as-is.
        ohe: Forwarded as ``ohe`` to the RDC factories.
        threshold: Forwarded positionally to the column-splitting factories.
        rand_gen: Forwarded as ``rand_gen`` to the RDC factories.
        n_jobs: Forwarded as ``n_jobs`` to the RDC and Poisson factories.

    Returns:
        The tuple ``(split_cols, split_rows)``.

    Raises:
        AssertionError: If ``cols`` or ``rows`` is a string that does not
            name a known strategy.
    """
    if isinstance(cols, str):
        if cols == "rdc":
            from spn.algorithms.splitting.RDC import get_split_cols_RDC_py

            split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif cols == "poisson":
            from spn.algorithms.splitting.PoissonStabilityTest import get_split_cols_poisson_py

            split_cols = get_split_cols_poisson_py(threshold, n_jobs=n_jobs)
        else:
            raise AssertionError("unknown columns splitting strategy type %s" % str(cols))
    else:
        # Non-string values are handed back untouched.
        split_cols = cols

    if isinstance(rows, str):
        if rows == "rdc":
            from spn.algorithms.splitting.RDC import get_split_rows_RDC_py

            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif rows == "kmeans":
            from spn.algorithms.splitting.Clustering import get_split_rows_KMeans

            split_rows = get_split_rows_KMeans()
        elif rows == "tsne":
            from spn.algorithms.splitting.Clustering import get_split_rows_TSNE

            split_rows = get_split_rows_TSNE()
        elif rows == "gmm":
            from spn.algorithms.splitting.Clustering import get_split_rows_GMM

            split_rows = get_split_rows_GMM()
        else:
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))
    else:
        # Non-string values are handed back untouched.
        split_rows = rows

    return split_cols, split_rows
示例#2
0
    def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
        """Build the splitting functions and delegate to ``learn_structure``.

        ``rand_gen`` and ``leaves`` are not parameters of this function; they
        are resolved from the surrounding scope. ``linear`` and ``ohe`` are
        accepted but not used here (the RDC factories are called with
        ``ohe=True``).

        Args:
            data: Passed through to ``learn_structure``.
            ds_context: Passed through to ``learn_structure``.
            cols: ``"rdc"`` selects the RDC column splitter; any other value
                leaves the column splitter as ``None``.
            rows: Must be ``"kmeans"``.
            min_instances_slice: Passed to ``get_next_operation``.
            threshold: Passed positionally to ``get_split_cols_RDC_py``.
            linear: Unused.
            ohe: Unused.

        Returns:
            The result of ``learn_structure``.

        Raises:
            AssertionError: If ``rows`` is not ``"kmeans"``.
        """
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                               non_linearity=np.sin, n_jobs=1,
                                               rand_gen=rand_gen)

        # NOTE(review): the "kmeans" strategy name is wired to the RDC row
        # splitter, not a k-means one — confirm this is intended.
        if rows == "kmeans":
            split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                               non_linearity=np.sin, n_jobs=1,
                                               rand_gen=rand_gen)
        else:
            # Without this branch split_rows stays unbound and the call below
            # fails with an UnboundLocalError that hides the real cause.
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
示例#3
0
    def learn_param(data, ds_context, cols, rows, min_instances_slice,
                    threshold, ohe):
        """Build the splitting functions and delegate to ``learn_structure``.

        ``rand_gen``, ``cpus`` and ``leaves`` are not parameters of this
        function; they are resolved from the surrounding scope.

        Args:
            data: Passed through to ``learn_structure``.
            ds_context: Passed through to ``learn_structure``.
            cols: Column splitting strategy; must be ``"rdc"``.
            rows: Row splitting strategy; ``"rdc"`` or ``"kmeans"``.
            min_instances_slice: Passed to ``get_next_operation``.
            threshold: Passed positionally to ``get_split_cols_RDC_py``.
            ohe: Forwarded as ``ohe`` to the RDC factories.

        Returns:
            The result of ``learn_structure``.

        Raises:
            AssertionError: If ``cols`` or ``rows`` is not a supported
                strategy name.
        """
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        else:
            # Fail with a clear message; otherwise split_cols stays unbound
            # and the learn_structure call raises UnboundLocalError.
            raise AssertionError("unknown columns splitting strategy type %s" % str(cols))

        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()
        else:
            # Same reasoning as for the column strategy above.
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop)
示例#4
0
    def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice,
                       threshold, linear, ohe):
        """Build the splitting functions and delegate to ``learn_structure``.

        ``rand_gen``, ``cpus`` and ``leaves`` are not parameters of this
        function; they are resolved from the surrounding scope. When
        ``leaves`` is ``None``, ``create_histogram_leaf`` is used as the leaf
        factory. ``linear`` is accepted but not used here.

        Args:
            data: Passed through to ``learn_structure``.
            ds_context: Passed through to ``learn_structure``.
            cols: Column splitting strategy; must be ``"rdc"``.
            rows: Row splitting strategy; ``"rdc"`` or ``"kmeans"``.
            min_instances_slice: Passed to ``get_next_operation``.
            threshold: Passed positionally to ``get_split_cols_RDC_py``.
            linear: Unused.
            ohe: Forwarded as ``ohe`` to the RDC factories.

        Returns:
            The result of ``learn_structure``.

        Raises:
            AssertionError: If ``cols`` or ``rows`` is not a supported
                strategy name.
        """
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        else:
            # Fail with a clear message; otherwise split_cols stays unbound
            # and the learn_structure call raises UnboundLocalError.
            raise AssertionError("unknown columns splitting strategy type %s" % str(cols))

        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()
        else:
            # Same reasoning as for the column strategy above.
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))

        # Assigning to `leaves` inside this function would make the name
        # local, so the `is None` check itself would raise UnboundLocalError.
        # Resolve the fallback into a separate local and only read `leaves`.
        leaf_factory = create_histogram_leaf if leaves is None else leaves

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaf_factory, nextop)
示例#5
0
    def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):
        """Learn a structure using RDC splitting for both columns and rows.

        The random generator (a ``RandomState`` seeded with 17 when none is
        given) is stored on ``ds_context.rand_gen`` and handed to both RDC
        factories. ``linear`` and ``ohe`` are accepted but not used here: the
        factories are always called with ``ohe=True``.

        Returns:
            The result of ``learn_structure`` with ``create_type_leaf`` as
            the leaf factory.
        """
        if rand_gen is None:
            rand_gen = np.random.RandomState(17)
        ds_context.rand_gen = rand_gen

        # FIXME (carried over from the original author): adopt the python
        # version of RDC, allowing to deal with missing values.
        # Options shared by the column and the row RDC factory.
        rdc_options = dict(ohe=True, k=10, s=1 / 6, non_linearity=np.sin,
                           n_jobs=1, rand_gen=rand_gen)
        column_splitter = get_split_cols_RDC_py(threshold, **rdc_options)
        row_splitter = get_split_rows_RDC_py(n_clusters=2, **rdc_options)

        leaf_factory = create_type_leaf
        next_operation = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, row_splitter, column_splitter,
                               leaf_factory, next_operation)
示例#6
0
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    """Learn an SPN over ``numpy_data`` and return its root node.

    Args:
        numpy_data: 2-D array-like; it is converted to a ``float64`` ndarray.
        feature_types: Per-column type names, each ``"discrete"`` or
            ``"continuous"``.
        spn_params: Mapping providing the keys ``"rdc_threshold"``,
            ``"cols"``, ``"rows"``, ``"min_instances_slice"``, ``"ohe"``,
            ``"prior_weight"`` and ``"identity_numeric"``.
        rand_gen: Forwarded as ``rand_gen`` to the RDC split factories.

    Returns:
        The value returned by ``learn_structure``.

    Raises:
        Exception: If a feature type is unknown, or if a leaf is requested
            for a meta type other than REAL or DISCRETE.
        ValueError: If ``spn_params["cols"]`` or ``spn_params["rows"]`` is
            not ``"rdc"``, the only strategy wired up here.
    """
    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py

    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf

    # Cast may not be necessary.
    numpy_data = np.array(numpy_data, np.float64)

    # Translate the feature type names into meta types.
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)

    # Per-column domains: [min, max] for continuous columns, the distinct
    # values otherwise. 'categorical' is accepted here although the meta type
    # loop above currently rejects it.
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        column = numpy_data[:, col]
        if feature_type == 'continuous':
            domains.append([np.min(column), np.max(column)])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(column))

    ds_context = Context(meta_types=meta_types, domains=domains)

    # Fixed parameters.
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]

    def create_leaf(data, ds_context, scope):
        """Create the leaf for the single variable ``scope[0]``."""
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]

        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)

            # A prior weight of exactly 0 is passed on as None.
            effective_prior = None if prior_weight == 0. else prior_weight
            return create_piecewise_leaf(data, ds_context, scope, prior_weight=effective_prior)

        if meta_type == MetaType.DISCRETE:
            unique, counts = np.unique(data[:, 0], return_counts=True)

            # The observed values are used directly as positions in the count
            # vector — assumes they are integer codes in [0, len(domain));
            # TODO confirm against the callers.
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for value, count in zip(unique, counts):
                sorted_counts[int(value)] = count

            p = sorted_counts / data.shape[0]

            # Regularization: add the prior weight to every entry, then
            # renormalize.
            if prior_weight > 0.:
                p += prior_weight
            p = p / np.sum(p)

            return Categorical(p, scope)

        raise Exception("Mehtod learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))

    leaves = create_leaf

    # Independence test (columns) and clustering (rows). Only "rdc" is
    # supported; fail with a clear message, since an unbound split function
    # would otherwise surface as UnboundLocalError at the learn_structure call.
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
    else:
        raise ValueError("Unknown column splitting strategy for SPN: " + str(cols))

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
    else:
        raise ValueError("Unknown row splitting strategy for SPN: " + str(rows))

    # This chooses which operation is performed next during learning.
    nextop = get_next_operation(min_instances_slice)

    # Learn the SPN.
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)

    return root_node