Example #1
    def learn_cond(data, ds_context, scope, cols, rows, min_instances_slice,
                   threshold, ohe):
        split_cols = None
        if cols == "ci":
            from spn.algorithms.splitting.RCoT import getCIGroup

            split_cols = getCIGroup(
                np.random.RandomState(17))  #(data, scope, threshold)
        else:
            raise ValueError('invalid independence test')
        if rows == "rand_hp":
            from spn.algorithms.splitting.Random import get_split_rows_random_partition

            split_rows = get_split_rows_random_partition(
                np.random.RandomState(17))  #(data, scope, threshold)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()
        else:
            # todo add other clustering?
            raise ValueError('invalid clustering method')

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop, scope)
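Like most snippets here, learn_cond is an inner function: leaves, get_split_rows_KMeans, get_next_operation, learn_structure, and np are free variables supplied by the enclosing module. A minimal sketch of that surrounding scope, using the import paths that appear in Examples #12 and #13 below and assuming a histogram leaf constructor (the original module may bind a different leaf type):

import numpy as np

from spn.algorithms.StructureLearning import get_next_operation, learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.structure.leaves.Histograms import create_histogram_leaf

# Leaf constructor that learn_cond picks up from the enclosing scope;
# substitute whatever leaf type the original module actually uses.
leaves = create_histogram_leaf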
Example #2
    def l_mspn(data, ds_context, cols, rows, min_instances_slice, threshold,
               ohe):
        split_cols, split_rows = get_splitting_functions(
            cols, rows, ohe, threshold, rand_gen, cpus)

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop)
Example #3
    def learn_param(data, ds_context, cols, rows, min_instances_slice,
                    threshold, ohe):
        split_cols, split_rows = get_splitting_functions(
            cols, rows, ohe, threshold, rand_gen, cpus)

        nextop = get_next_operation(min_instances_slice, min_features_slice,
                                    multivariate_leaf, cluster_univariate)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop)
Example #4
    def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC(threshold, ohe, linear)
        if rows == "kmeans":
            split_rows = get_split_rows_KMeans()

        leaves = create_histogram_leaf

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #5
    def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        if rows == "kmeans":
            split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #6
def learn(data,
          ds_context,
          min_instances_slice=200,
          threshold=0.00000001,
          linear=False):
    split_cols = lambda data, ds_context, scope: split_cols_RDC(
        data, ds_context, scope, threshold=threshold, linear=linear)
    def nextop(data, no_clusters=False, no_independencies=False, is_first=False,
               cluster_first=True, cluster_univariate=False,
               min_instances_slice=min_instances_slice):
        return next_operation(data, no_clusters, no_independencies, is_first,
                              cluster_first, cluster_univariate, min_instances_slice)

    spn = learn_structure(data, ds_context, split_rows_KMeans, split_cols,
                          create_histogram_leaf, nextop)

    return spn
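A usage sketch for the learn function above, modeled on the end-to-end script in Example #12. The synthetic data and the min_instances_slice value are illustrative, and the call assumes the enclosing module also binds split_cols_RDC, split_rows_KMeans, next_operation, and create_histogram_leaf, which are free variables in the snippet:

if __name__ == '__main__':
    import numpy as np

    from spn.algorithms import Inference
    from spn.structure.Base import Context
    from spn.structure.leaves.Histograms import add_domains

    # Small synthetic discrete dataset, for illustration only.
    data = np.random.RandomState(17).randint(0, 5, size=(500, 4))

    ds_context = Context()
    ds_context.statistical_type = np.asarray(["discrete"] * data.shape[1])
    add_domains(data, ds_context)

    spn = learn(data, ds_context, min_instances_slice=100)
    print(Inference.likelihood(spn, data[0:10, :]))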
Example #7
    def learn(data, ds_context, min_instances_slice, rand_gen):

        if rand_gen is None:
            rand_gen = np.random.RandomState(17)

        ds_context.rand_gen = rand_gen

        split_cols = get_split_cols_binary_random_partition(threshold=col_threshold,
                                                            beta_a=col_a, beta_b=col_b)
        split_rows = get_split_rows_binary_random_partition(beta_a=row_a, beta_b=row_b)

        # leaves = create_random_parametric_leaf
        leaves = create_random_unconstrained_type_mixture_leaf

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #8
    def learn_param(data, ds_context, cols, rows, min_instances_slice,
                    threshold, ohe, initial_scope, l_rfft, is_2d):
        split_cols, split_rows = get_splitting_functions(
            cols, rows, ohe, threshold, rand_gen, cpus)

        nextop = get_next_operation(min_instances_slice, min_features_slice,
                                    multivariate_leaf)

        return learn_structure(data,
                               ds_context,
                               split_rows,
                               split_cols,
                               leaves,
                               nextop,
                               initial_scope,
                               l_rfft=l_rfft,
                               is_2d=is_2d)
Example #9
    def learn_param(data, ds_context, cols, rows, min_instances_slice,
                    threshold, ohe):
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop)
Example #10
    def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice,
                       threshold, linear, ohe):
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()

        # `leaves` comes from the enclosing scope; rebinding it here would make the
        # name local and break the None check, so use a separate local name instead.
        leaf_fn = create_histogram_leaf if leaves is None else leaves

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaf_fn, nextop)
Example #11
    def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):

        if rand_gen is None:
            rand_gen = np.random.RandomState(17)

        ds_context.rand_gen = rand_gen

        #
        # FIXME: adopt the Python version of RDC, which can deal with missing values
        # split_cols = get_split_cols_RDC(threshold, ohe, linear)
        split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        # get_split_rows_RDC(n_clusters=2, k=10, s=1 / 6, ohe=True, seed=rand_gen)

        leaves = create_type_leaf

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #12
'''
@author: Alejandro Molina
'''

from spn.algorithms import Inference
from spn.algorithms.StructureLearning import learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.algorithms.splitting.RDC import get_split_cols_RDC
from spn.data.datasets import get_nips_data
from spn.structure.Base import Context
from spn.structure.leaves.Histograms import add_domains, create_histogram_leaf

if __name__ == '__main__':
    import numpy as np

    ds_name, words, data, train, _, statistical_type, _ = get_nips_data()

    print(words)

    print(data)

    ds_context = Context()
    ds_context.statistical_type = np.asarray(["discrete"] * data.shape[1])

    add_domains(data, ds_context)

    spn = learn_structure(data, ds_context, get_split_rows_KMeans(),
                          get_split_cols_RDC(), create_histogram_leaf)

    # print(to_str_equation(spn, words))
    print(Inference.likelihood(spn, data[0:100, :]))
Example #13
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    
    import numpy as np

    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py
    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf


    #cast may not be necessary
    numpy_data = np.array(numpy_data, np.float64)
    
    #Generate meta_type array
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)
    
    #Create information about the domains
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        if feature_type == 'continuous':
            domains.append([np.min(numpy_data[:, col]), np.max(numpy_data[:, col])])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(numpy_data[:, col]))
    
    #Create context
    ds_context = Context(meta_types=meta_types, domains=domains)
        
    #Fixed parameters
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]
    
    #Method to create leaves in the SPN
    def create_leaf(data, ds_context, scope):
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]
        
        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)
        
            if prior_weight == 0.:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=None)
            else:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=prior_weight)
            

        elif meta_type == MetaType.DISCRETE:
            
            unique, counts = np.unique(data[:,0], return_counts=True)
            
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for i, x in enumerate(unique):
                sorted_counts[int(x)] = counts[i] 
            
            p = sorted_counts / data.shape[0]
            
            #Do regularization
            if prior_weight > 0.:
                p += prior_weight
            p = p/np.sum(p)
            
            return Categorical(p, scope)

        else:
            raise Exception("Mehtod learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))
    
    #Set method to create leaves
    leaves = create_leaf
    
    #Set methods to cluster and to do the independence test
    if cols == "rdc":
        #split_cols = get_split_cols_RDC(rdc_threshold, ohe=ohe, linear=True)
        split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        
    if rows == "rdc":
        #split_rows = get_split_rows_RDC(ohe=ohe)
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        
    # This chooses which operation is performed next
    nextop = get_next_operation(min_instances_slice)
    
    #Learn the SPN
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)
    
    return root_node
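A usage sketch for build_spn: the spn_params keys are exactly the ones read in the function body, while the toy data, parameter values, and seeds are illustrative only. It assumes the SPFlow imports inside build_spn (including numpy, Context, and MetaType) resolve against an installed SPFlow.

import numpy as np

# Toy dataset: one continuous and one discrete column (illustrative only).
rng = np.random.RandomState(0)
numpy_data = np.column_stack([rng.normal(size=200),
                              rng.randint(0, 3, size=200)])
feature_types = ["continuous", "discrete"]

spn_params = {
    "rdc_threshold": 0.3,        # illustrative values; tune per dataset
    "cols": "rdc",
    "rows": "rdc",
    "min_instances_slice": 50,
    "ohe": False,
    "prior_weight": 0.01,
    "identity_numeric": False,
}

root = build_spn(numpy_data, feature_types, spn_params,
                 rand_gen=np.random.RandomState(17))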