Example #1
def get_splitting_functions(cols, rows, ohe, threshold, rand_gen, n_jobs):
    """Resolve string identifiers for the column/row splitting strategies into callables."""
    from spn.algorithms.splitting.Clustering import get_split_rows_KMeans, get_split_rows_TSNE, get_split_rows_GMM
    from spn.algorithms.splitting.PoissonStabilityTest import get_split_cols_poisson_py
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py

    if isinstance(cols, str):
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif cols == "poisson":
            split_cols = get_split_cols_poisson_py(threshold, n_jobs=n_jobs)
        else:
            raise AssertionError("unknown columns splitting strategy type %s" % str(cols))
    else:
        split_cols = cols

    if isinstance(rows, str):
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=n_jobs)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()
        elif rows == "tsne":
            split_rows = get_split_rows_TSNE()
        elif rows == "gmm":
            split_rows = get_split_rows_GMM()
        else:
            raise AssertionError("unknown rows splitting strategy type %s" % str(rows))
    else:
        split_rows = rows
    return split_cols, split_rows
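
A minimal usage sketch for the factory above; the argument values are illustrative assumptions, not part of the original example:

import numpy as np

split_cols, split_rows = get_splitting_functions(
    cols="rdc", rows="kmeans", ohe=False, threshold=0.3,
    rand_gen=np.random.RandomState(17), n_jobs=1)
# both results are callables of the form f(data, ds_context, scope),
# which is how the later examples invoke them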
Example #2
    def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
        # `rand_gen`, `leaves` and `get_next_operation` come from the enclosing scope
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                               non_linearity=np.sin, n_jobs=1,
                                               rand_gen=rand_gen)
        split_rows = None
        if rows == "kmeans":
            # note: despite the "kmeans" flag, this example splits rows with the RDC-based method
            split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                               non_linearity=np.sin, n_jobs=1,
                                               rand_gen=rand_gen)

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #3
    def learn_param(data, ds_context, cols, rows, min_instances_slice,
                    threshold, ohe):
        # `rand_gen`, `cpus`, `leaves` and `get_next_operation` come from
        # the enclosing scope
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        split_rows = None
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaves, nextop)
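
Examples #2 through #5 are nested learners that close over `rand_gen`, `cpus`, `leaves` and `get_next_operation` from an enclosing wrapper. A minimal sketch of such a wrapper, assuming SPFlow's public splitting and structure-learning helpers; the wrapper name and default values are illustrative assumptions:

import numpy as np

from spn.algorithms.StructureLearning import get_next_operation, learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py
from spn.structure.leaves.parametric.Parametric import create_parametric_leaf


def learn_parametric_spn(data, ds_context, cols="rdc", rows="kmeans",
                         min_instances_slice=200, threshold=0.3, ohe=False,
                         leaves=None, rand_gen=None, cpus=-1):
    # defaults mirror the closed-over variables the nested learners expect
    if rand_gen is None:
        rand_gen = np.random.RandomState(17)
    if leaves is None:
        leaves = create_parametric_leaf

    # resolve the splitting strategies, mirroring learn_param above
    split_cols = (get_split_cols_RDC_py(threshold, rand_gen=rand_gen,
                                        ohe=ohe, n_jobs=cpus)
                  if cols == "rdc" else cols)
    split_rows = (get_split_rows_KMeans() if rows == "kmeans"
                  else get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe,
                                             n_jobs=cpus))

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)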
Example #4
    def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice,
                       threshold, linear, ohe):
        # `rand_gen`, `cpus`, `leaves` and `get_next_operation` come from
        # the enclosing scope
        split_cols = None
        if cols == "rdc":
            split_cols = get_split_cols_RDC_py(threshold,
                                               rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        split_rows = None
        if rows == "rdc":
            split_rows = get_split_rows_RDC_py(rand_gen=rand_gen,
                                               ohe=ohe,
                                               n_jobs=cpus)
        elif rows == "kmeans":
            split_rows = get_split_rows_KMeans()

        # rebinding `leaves` in place would raise UnboundLocalError (assignment
        # makes it local to this function), so fall back through a separate name
        leaf_fn = create_histogram_leaf if leaves is None else leaves

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols,
                               leaf_fn, nextop)
Example #5
    def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):

        if rand_gen is None:
            rand_gen = np.random.RandomState(17)

        ds_context.rand_gen = rand_gen

        # FIXME: adapt the python version of RDC so that it can deal with missing values
        # split_cols = get_split_cols_RDC(threshold, ohe, linear)
        split_cols = get_split_cols_RDC_py(threshold, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=True, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        # get_split_rows_RDC(n_clusters=2, k=10, s=1 / 6, ohe=True, seed=rand_gen)

        leaves = create_type_leaf

        nextop = get_next_operation(min_instances_slice)

        return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
Example #6
def learn_spmn_structure(train_data, index, scope_index, params):

    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)
        spn0 = []
        index = index + 1
        set_next_operation("None")
        for c in cl:

            if index < len(params.partial_order):

                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)

            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:

        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)

        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)

        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)
        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):
            set_next_operation("Sum")

            if params.util_to_bin:

                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20, initial_scope=scope_prod)

            else:

                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod, min_instances_slice=20,
                                    initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):

                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])

                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

            else:
                spn = spn0
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        else:

            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))

            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []

            if index < len(params.partial_order):

                for cl, scop, weight in data_slices_sum:

                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

                spn = Sum(weights=weights, children=spn0)
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn
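
The recursion above reads `partial_order`, `decision_nodes`, `feature_names` and `util_to_bin` off the `params` object. A hypothetical container with exactly those attributes; the field values are illustrative, not from the original:

from types import SimpleNamespace

params = SimpleNamespace(
    partial_order=[["Weather"], ["TakeUmbrella"], ["Utility"]],  # information sets
    decision_nodes=["TakeUmbrella"],
    feature_names=["Weather", "TakeUmbrella", "Utility"],
    util_to_bin=False,
)
# spmn_root = learn_spmn_structure(train_data, 0, 0, params)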
Example #7
    #
    # if compress:
    #     node = Compress(node)
    # if prune:
    #     node = Prune(node)
    # if validate:
    #     valid, err = is_valid(node)
    #     assert valid, "invalid spn: " + err

    return node


if __name__ == "__main__":
    # imports assumed for a stand-alone run of this demo
    import numpy as np
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical, create_parametric_leaf
    from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py

    # two Gaussian features plus a 0/1 class column
    train_data = np.c_[np.r_[np.random.normal(5, 1, (500, 2)),
                             np.random.normal(10, 1, (500, 2))],
                       np.r_[np.zeros((500, 1)),
                             np.ones((500, 1))]]
    spn = learn_structure(
        train_data,
        Context(parametric_types=[Gaussian, Gaussian, Categorical
                                  ]).add_domains(train_data),
        scope=list(range(train_data.shape[1])),
        split_rows=get_split_rows_KMeans(),
        split_cols=get_split_cols_RDC_py(),
        create_leaf=create_parametric_leaf,
    )

    from spn.io.plot import TreeVisualization

    TreeVisualization.plot_spn(spn, file_name="tree_spn2.png")
Example #8
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    
    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py

    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf


    # cast may not be necessary
    numpy_data = np.array(numpy_data, np.float64)

    # generate the meta_type array
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)
    
    # Create information about the domains
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        if feature_type == 'continuous':
            domains.append([np.min(numpy_data[:, col]), np.max(numpy_data[:, col])])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(numpy_data[:, col]))
    
    # Create context
    ds_context = Context(meta_types=meta_types, domains=domains)
        
    # Fixed parameters
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]
    
    # Method to create leaves in the SPN
    def create_leaf(data, ds_context, scope):
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]
        
        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)
        
            if prior_weight == 0.:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=None)
            else:
                return create_piecewise_leaf(data, ds_context, scope, prior_weight=prior_weight)
            

        elif meta_type == MetaType.DISCRETE:
            
            unique, counts = np.unique(data[:,0], return_counts=True)
            
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for i, x in enumerate(unique):
                sorted_counts[int(x)] = counts[i] 
            
            p = sorted_counts / data.shape[0]
            
            # Do regularization: add the prior weight, then renormalize
            if prior_weight > 0.:
                p += prior_weight
            p = p / np.sum(p)

            return Categorical(p, scope)

        else:
            raise Exception("Method learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))
    
    # Set the method to create leaves
    leaves = create_leaf

    # Set the methods for row clustering and the column independence test
    if cols == "rdc":
        #split_cols = get_split_cols_RDC(rdc_threshold, ohe=ohe, linear=True)
        split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        
    if rows == "rdc":
        #split_rows = get_split_rows_RDC(ohe=ohe)
        split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6,
                                           non_linearity=np.sin, n_jobs=1,
                                           rand_gen=rand_gen)
        
    # This chooses which operation is performed
    nextop = get_next_operation(min_instances_slice)
    
    # Learn the SPN
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)
    
    return root_node
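
An illustrative call to `build_spn`; the parameter values and data below are assumptions, but the dictionary keys are exactly the ones the function reads:

import numpy as np

spn_params = {
    "rdc_threshold": 0.3,
    "cols": "rdc",
    "rows": "rdc",
    "min_instances_slice": 50,
    "ohe": False,
    "prior_weight": 0.01,
    "identity_numeric": False,
}
# one continuous feature, one discrete feature with values 0..2
data = np.c_[np.random.normal(0, 1, 1000), np.random.randint(0, 3, 1000)]
root = build_spn(data, ["continuous", "discrete"], spn_params,
                 np.random.RandomState(17))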
Example #9
    def __learn_spmn_structure(self, remaining_vars_data, remaining_vars_scope,
                               curr_information_set_scope, index):

        logging.info(
            f'start of new recursion in __learn_spmn_structure method of SPMN')
        logging.debug(f'remaining_vars_scope: {remaining_vars_scope}')
        logging.debug(
            f'curr_information_set_scope: {curr_information_set_scope}')

        # rest set is remaining variables excluding the variables in current information set
        rest_set_scope = [
            var_scope for var_scope in remaining_vars_scope
            if var_scope not in curr_information_set_scope
        ]

        logging.debug(f'rest_set_scope: {rest_set_scope}')

        scope_index = sum([len(x) for x in self.params.partial_order[:index]])
        next_scope_index = sum(
            [len(x) for x in self.params.partial_order[:index + 1]])

        if remaining_vars_scope == curr_information_set_scope:
            # this is last information set in partial order. Base case of recursion

            # test if current information set is a decision node
            if self.params.partial_order[index][
                    0] in self.params.decision_nodes:
                raise Exception(
                    f'last information set of partial order either contains random '
                    f'and utility variables or just a utility variable. '
                    f'This contains decision variable: {self.params.partial_order[index][0]}'
                )

            else:
                # contains just the random and utility variables

                logging.info(
                    f'at last information set of this recursive call: {curr_information_set_scope}'
                )
                ds_context_last_information_set = get_ds_context(
                    remaining_vars_data, remaining_vars_scope, self.params)

                if self.params.util_to_bin:

                    last_information_set_spn = learn_parametric(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

                else:

                    last_information_set_spn = learn_mspn_for_spmn(
                        remaining_vars_data,
                        ds_context_last_information_set,
                        min_instances_slice=20,
                        initial_scope=remaining_vars_scope)

            logging.info(f'created spn at last information set')
            return last_information_set_spn

        # test for decision node. test if current information set is a decision node
        elif self.params.partial_order[index][0] in self.params.decision_nodes:

            decision_node = self.params.partial_order[index][0]

            logging.info(f'Encountered Decision Node: {decision_node}')

            # cluster the data from remaining variables w.r.t values of decision node
            clusters_on_next_remaining_vars, dec_vals = split_on_decision_node(
                remaining_vars_data)

            decision_node_children_spns = []
            index += 1

            next_information_set_scope = np.array(
                range(next_scope_index, next_scope_index +
                      len(self.params.partial_order[index]))).tolist()

            next_remaining_vars_scope = rest_set_scope
            self.set_next_operation('Any')

            logging.info(f'split clusters based on decision node values')
            for cluster_on_next_remaining_vars in clusters_on_next_remaining_vars:

                decision_node_children_spns.append(
                    self.__learn_spmn_structure(cluster_on_next_remaining_vars,
                                                next_remaining_vars_scope,
                                                next_information_set_scope,
                                                index))

            decision_node_spn_branch = Max(
                dec_idx=scope_index,
                dec_values=dec_vals,
                children=decision_node_children_spns,
                feature_name=decision_node)

            assign_ids(decision_node_spn_branch)
            rebuild_scopes_bottom_up(decision_node_spn_branch)
            logging.info(f'created decision node')
            return decision_node_spn_branch

        # testing for independence
        else:

            curr_op = self.get_curr_operation()
            logging.debug(
                f'curr_op at prod node (independence test): {curr_op}')

            if curr_op != 'Sum':  # false if a correlated variable set was found
                # in the previous recursive call; without this condition the
                # code would keep looping at this stage

                ds_context = get_ds_context(remaining_vars_data,
                                            remaining_vars_scope, self.params)

                split_cols = get_split_cols_RDC_py()
                data_slices_prod = split_cols(remaining_vars_data, ds_context,
                                              remaining_vars_scope)

                logging.debug(
                    f'{len(data_slices_prod)} slices found at data_slices_prod: '
                )

                prod_children = []
                next_remaining_vars_scope = []
                independent_vars_scope = []

                for correlated_var_set_cluster, correlated_var_set_scope, weight in data_slices_prod:

                    if any(var_scope in correlated_var_set_scope
                           for var_scope in rest_set_scope):

                        next_remaining_vars_scope.extend(
                            correlated_var_set_scope)

                    else:
                        # this variable set of current information set is
                        # not correlated to any variable in the rest set

                        logging.info(
                            f'independent variable set found: {correlated_var_set_scope}'
                        )

                        ds_context_prod = get_ds_context(
                            correlated_var_set_cluster,
                            correlated_var_set_scope, self.params)

                        if self.params.util_to_bin:

                            independent_var_set_prod_child = learn_parametric(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)

                        else:

                            independent_var_set_prod_child = learn_mspn_for_spmn(
                                correlated_var_set_cluster,
                                ds_context_prod,
                                min_instances_slice=20,
                                initial_scope=correlated_var_set_scope)
                        independent_vars_scope.extend(correlated_var_set_scope)
                        prod_children.append(independent_var_set_prod_child)

                logging.info(
                    f'correlated variables over entire remaining variables '
                    f'at prod, passed for next recursion: '
                    f'{next_remaining_vars_scope}')

                # check if all variables in current information set are consumed
                if all(var_scope in independent_vars_scope
                       for var_scope in curr_information_set_scope):

                    index += 1
                    next_information_set_scope = np.array(
                        range(
                            next_scope_index, next_scope_index +
                            len(self.params.partial_order[index]))).tolist()

                    # since current information set is totally consumed
                    next_remaining_vars_scope = rest_set_scope

                else:
                    # some variables in the current information set still
                    # remain, so index stays the same

                    next_information_set_scope = set(
                        curr_information_set_scope) - set(
                            independent_vars_scope)
                    next_remaining_vars_scope = next_information_set_scope | set(
                        rest_set_scope)

                    # convert unordered sets of scope to sorted lists to keep in sync with partial order
                    next_information_set_scope = sorted(
                        list(next_information_set_scope))
                    next_remaining_vars_scope = sorted(
                        list(next_remaining_vars_scope))

                self.set_next_operation('Sum')

                next_remaining_vars_data = column_slice_data_by_scope(
                    remaining_vars_data, remaining_vars_scope,
                    next_remaining_vars_scope)

                logging.info(
                    f'independence test completed for current information set {curr_information_set_scope} '
                    f'and rest set {rest_set_scope} ')

                remaining_vars_prod_child = self.__learn_spmn_structure(
                    next_remaining_vars_data, next_remaining_vars_scope,
                    next_information_set_scope, index)

                prod_children.append(remaining_vars_prod_child)

                product_node = Product(children=prod_children)
                assign_ids(product_node)
                rebuild_scopes_bottom_up(product_node)

                logging.info(f'created product node')
                return product_node

            # Cluster the data
            else:

                curr_op = self.get_curr_operation()
                logging.debug(f'curr_op at sum node (cluster test): {curr_op}')

                split_rows = get_split_rows_KMeans()  # from SPMNHelper.py

                if self.cluster_by_curr_information_set:

                    curr_information_set_data = column_slice_data_by_scope(
                        remaining_vars_data, remaining_vars_scope,
                        curr_information_set_scope)

                    ds_context_sum = get_ds_context(
                        curr_information_set_data, curr_information_set_scope,
                        self.params)
                    data_slices_sum, km_model = split_rows(
                        curr_information_set_data, ds_context_sum,
                        curr_information_set_scope)

                    logging.info(
                        f'split clusters based on current information set {curr_information_set_scope}'
                    )

                else:
                    # cluster on whole remaining variables
                    ds_context_sum = get_ds_context(remaining_vars_data,
                                                    remaining_vars_scope,
                                                    self.params)
                    data_slices_sum, km_model = split_rows(
                        remaining_vars_data, ds_context_sum,
                        remaining_vars_scope)

                    logging.info(
                        f'split clusters based on whole remaining variables {remaining_vars_scope}'
                    )

                sum_node_children = []
                weights = []
                logging.debug(
                    f'{len(data_slices_sum)} clusters found at data_slices_sum'
                )

                cluster_num = 0
                labels_array = km_model.labels_
                logging.debug(
                    f'cluster labels of rows: {labels_array} used to cluster data on '
                    f'total remaining variables {remaining_vars_scope}')

                for cluster, scope, weight in data_slices_sum:

                    self.set_next_operation("Prod")

                    # cluster whole remaining variables based on clusters formed.
                    # below methods are useful if clusters were formed on just the current information set

                    cluster_indices = get_row_indices_of_cluster(
                        labels_array, cluster_num)
                    cluster_on_remaining_vars = row_slice_data_by_indices(
                        remaining_vars_data, cluster_indices)

                    # logging.debug(np.array_equal(cluster_on_remaining_vars, cluster ))

                    sum_node_children.append(
                        self.__learn_spmn_structure(
                            cluster_on_remaining_vars, remaining_vars_scope,
                            curr_information_set_scope, index))

                    weights.append(weight)

                    cluster_num += 1

                sum_node = Sum(weights=weights, children=sum_node_children)

                assign_ids(sum_node)
                rebuild_scopes_bottom_up(sum_node)
                logging.info(f'created sum node')
                return sum_node