def test_cltree_node_eval(): # # testing for the correctness of data masking while evaluating # shall I move this test to CLTree? s_data = numpy.random.binomial(n=1, p=0.5, size=(1000, 100)) print(s_data) random_features = numpy.random.choice(s_data.shape[1], 20) sub_s_data = s_data[:, random_features] clt_node = CLTreeNode(vars=random_features, data=sub_s_data, var_values=numpy.array([2 for i in range(sub_s_data.shape[1])])) # evaluating on all s_data lls = [] for instance in s_data: clt_node.eval(instance) lls.append(clt_node.log_val) # # now do one on the only sub clt_node = CLTreeNode(vars=[i for i in range(sub_s_data.shape[1])], data=sub_s_data, var_values=numpy.array([2 for i in range(sub_s_data.shape[1])])) lls_s = [] for instance in sub_s_data: clt_node.eval(instance) lls_s.append(clt_node.log_val) # print(lls) # print(lls_s) assert_array_almost_equal(numpy.array(lls), numpy.array(lls_s))
def test_categorical_clt_input_layer_eval(): # # just a little test to see how mixed nodes layers # dispatch eval s_data = numpy.array([[1, 1, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0], [1, 1, 0, 0], [1, 0, 1, 0], [0, 1, 1, 0]]) features = [0, 2, 3] feature_vals = [2, 2, 2] clt_node = CLTreeNode(data=s_data[:, features], vars=features, var_values=feature_vals, alpha=0.0) # # creating a categorical smoothed node cs_node = CategoricalSmoothedNode(var=1, var_values=2, alpha=0., data=s_data[:, [1]]) clti_layer = CategoricalCLInputLayer(nodes=[cs_node, clt_node]) nico_cltree_subtree = numpy.array([-1, 0, 1]) nico_cltree_sublls = numpy.array([-1.09861228867, -0.69314718056, -0.69314718056, -1.79175946923, -1.09861228867, -0.69314718056]) assert_array_equal(numpy.array(features), clt_node.vars) assert_array_equal(nico_cltree_subtree, clt_node._cltree._tree) s_log_prob = numpy.log(0.5) # # evaluating the layer for i, instance in enumerate(s_data): clti_layer.eval(instance) for node in clti_layer.nodes(): if isinstance(node, CLTreeNode): assert_almost_equal(nico_cltree_sublls[i], node.log_val) else: # in the other case is 0.5 assert_almost_equal(s_log_prob, node.log_val)
def test_cltree_node_init_and_eval(): s_data = numpy.array([[1, 1, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [1, 0, 1, 0], [0, 1, 1, 0]]) features = [0, 1, 2, 3] feature_vals = [2, 2, 2, 2] clt_node = CLTreeNode(data=s_data, vars=features, var_values=feature_vals, alpha=0.0) assert_array_equal(numpy.array(features), clt_node.vars) assert_almost_equal(clt_node._alpha, 0.0) assert_array_almost_equal(s_data, clt_node._data) nico_cltree_tree = numpy.array([-1, 2, 0, 2]) nico_cltree_lls = numpy.array([-2.01490302054, -1.20397280433, -1.20397280433, -1.79175946923, -1.60943791243, -1.60943791243]) assert_array_equal(nico_cltree_tree, clt_node._cltree._tree) print('Created node') print(clt_node) # # evaluating for i, instance in enumerate(s_data): clt_node.eval(instance) assert_almost_equal(nico_cltree_lls[i], clt_node.log_val) print(clt_node.log_val, nico_cltree_lls[i]) # # creating now with a subset from data features = [0, 2, 3] feature_vals = [2, 2, 2] clt_node = CLTreeNode(data=s_data[:, features], vars=features, var_values=feature_vals, alpha=0.0) nico_cltree_subtree = numpy.array([-1, 0, 1]) nico_cltree_sublls = numpy.array([-1.09861228867, -0.69314718056, -0.69314718056, -1.79175946923, -1.09861228867, -0.69314718056]) assert_array_equal(numpy.array(features), clt_node.vars) assert_array_equal(nico_cltree_subtree, clt_node._cltree._tree) for i, instance in enumerate(s_data): clt_node.eval(instance) assert_almost_equal(nico_cltree_sublls[i], clt_node.log_val) print(clt_node.log_val, nico_cltree_sublls[i])
def test_layered_pruned_linked_spn_cltree(): # # creating all the data slices # the slicing is a fake stub rows = 5 cols = 5 var = 1 values = 2 vars = [2, 3] var_values = [2, 2] s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]]) node_1 = SumNode() node_1.id = 1 node_2 = ProductNode() node_2.id = 2 node_3 = SumNode() node_3.id = 3 # adding first level weight_12 = 0.4 weight_13 = 0.6 node_1.add_child(node_2, weight_12) node_1.add_child(node_3, weight_13) node_4 = ProductNode() node_4.id = 4 leaf_5 = CategoricalSmoothedNode(var, values) leaf_5.id = 5 # not adding the slice to the stack node_2.add_child(node_4) node_2.add_child(leaf_5) node_6 = SumNode() node_6.id = 6 node_7 = SumNode() node_7.id = 7 weight_36 = 0.1 weight_37 = 0.9 node_3.add_child(node_6, weight_36) node_3.add_child(node_7, weight_37) node_8 = ProductNode() node_8.id = 8 # # this is a cltree leaf_15 = CLTreeNode(vars=vars, var_values=var_values, data=s_data) leaf_15.id = 15 node_4.add_child(node_8) node_4.add_child(leaf_15) leaf_13 = CategoricalSmoothedNode(var, values) leaf_13.id = 13 leaf_14 = CLTreeNode(vars=vars, var_values=var_values, data=s_data) leaf_14.id = 14 node_8.add_child(leaf_13) node_8.add_child(leaf_14) leaf_9 = CLTreeNode(vars=vars, var_values=var_values, data=s_data) leaf_9.id = 9 node_10 = ProductNode() node_10.id = 10 leaf_18 = CategoricalSmoothedNode(var, values) leaf_18.id = 18 leaf_19 = CategoricalSmoothedNode(var, values) leaf_19.id = 19 node_10.add_child(leaf_18) node_10.add_child(leaf_19) weight_69 = 0.3 weight_610 = 0.7 node_6.add_child(leaf_9, weight_69) node_6.add_child(node_10, weight_610) node_11 = ProductNode() node_11.id = 11 leaf_20 = CategoricalSmoothedNode(var, values) leaf_20.id = 20 leaf_21 = CategoricalSmoothedNode(var, values) leaf_21.id = 21 node_11.add_child(leaf_20) node_11.add_child(leaf_21) node_12 = ProductNode() node_12.id = 12 leaf_22 = CLTreeNode(vars=vars, var_values=var_values, data=s_data) leaf_22.id = 22 leaf_23 = CategoricalSmoothedNode(var, values) leaf_23.id = 23 node_12.add_child(leaf_22) node_12.add_child(leaf_23) weight_711 = 0.5 weight_712 = 0.5 node_7.add_child(node_11, weight_711) node_7.add_child(node_12, weight_712) print('Added nodes') root_node = SpnFactory.layered_pruned_linked_spn(node_1) print('ROOT nODE', root_node) spn = SpnFactory.layered_linked_spn(root_node) print('SPN', spn) assert spn.n_layers() == 3 for i, layer in enumerate(spn.top_down_layers()): if i == 0: assert layer.n_nodes() == 1 elif i == 1: assert layer.n_nodes() == 4 elif i == 2: assert layer.n_nodes() == 10
def fit_structure(self, data, feature_sizes): """ data is a numpy array of size {n_instances X n_features} feature_sizes is an array of integers representing feature ranges """ # # resetting the data slice ids (just in case) DataSlice.reset_id_counter() tot_n_instances = data.shape[0] tot_n_features = data.shape[1] logging.info('Learning SPN structure on a (%d X %d) dataset', tot_n_instances, tot_n_features) learn_start_t = perf_counter() # # a queue containing the data slices to process slices_to_process = deque() # a stack for building nodes building_stack = deque() # a dict to keep track of id->nodes node_id_assoc = {} # creating the first slice whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features) slices_to_process.append(whole_slice) first_run = True # # iteratively process & split slices # while slices_to_process: # process a slice current_slice = slices_to_process.popleft() # pointers to the current data slice current_instances = current_slice.instance_ids current_features = current_slice.feature_ids current_id = current_slice.id n_instances = len(current_instances) n_features = len(current_features) logging.info('\n*** Processing slice %d (%d X %d)', current_id, n_instances, n_features) logging.debug('\tinstances:%s\n\tfeatures:%s', current_instances, current_features) # # is this a leaf node or we can split? if n_features == 1: logging.info('---> Adding a leaf (just one feature)') (feature_id, ) = current_features feature_size = feature_sizes[feature_id] # slicing from the original dataset slice_data_rows = data[current_instances, :] current_slice_data = slice_data_rows[:, current_features] # create the node leaf_node = CategoricalSmoothedNode( var=feature_id, var_values=feature_size, data=current_slice_data, instances=current_instances, alpha=self._alpha) # print('lnvf', leaf_node._var_freqs) # storing links # input_nodes.append(leaf_node) leaf_node.id = current_id node_id_assoc[current_id] = leaf_node logging.debug('\tCreated Smooth Node %s', leaf_node) elif (n_instances <= self._min_instances_slice and n_features > 1): # # splitting the slice on each feature logging.info('---> Few instances (%d), decompose all features', n_instances) # # shall put a cltree or if self._cltree_leaves: logging.info('into a Chow-Liu tree') # # slicing data slice_data_rows = data[current_instances, :] current_slice_data = slice_data_rows[:, current_features] current_feature_sizes = [ feature_sizes[i] for i in current_features ] # # creating a Chow-Liu tree as leaf leaf_node = CLTreeNode(vars=current_features, var_values=current_feature_sizes, data=current_slice_data, alpha=self._alpha) # # storing links leaf_node.id = current_id node_id_assoc[current_id] = leaf_node logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node) elif self._kde and n_instances > 1: estimate_kernel_density_spn(current_slice, feature_sizes, data, self._alpha, node_id_assoc, building_stack, slices_to_process) # elif n_instances == 1: # FIXME: there is a bug here else: current_slice, slices_to_process, building_stack, node_id_assoc = \ self.make_naive_factorization(current_slice, slices_to_process, building_stack, node_id_assoc) else: # # slicing from the original dataset slice_data_rows = data[current_instances, :] current_slice_data = slice_data_rows[:, current_features] split_on_features = False # # first run is a split on rows if first_run: logging.info('-- FIRST RUN --') first_run = False else: # # try clustering on cols # logging.debug('...trying to split on columns') split_start_t = perf_counter() print(data.shape) dependent_features, other_features = greedy_feature_split( data, current_slice, feature_sizes, self._g_factor, self._rand_gen) split_end_t = perf_counter() logging.info('...tried to split on columns in {}'.format( split_end_t - split_start_t)) if len(other_features) > 0: split_on_features = True # # have dependent components been found? if split_on_features: # # splitting on columns logging.info( '---> Splitting on features' + ' {} -> ({}, {})'.format(len(current_features), len(dependent_features), len(other_features))) # # creating two new data slices and putting them on queue first_slice = DataSlice(current_instances, dependent_features) second_slice = DataSlice(current_instances, other_features) slices_to_process.append(first_slice) slices_to_process.append(second_slice) children_ids = [first_slice.id, second_slice.id] # # storing link parent children current_slice.type = ProductNode building_stack.append(current_slice) current_slice.add_child(first_slice) current_slice.add_child(second_slice) # # creating product node prod_node = ProductNode( var_scope=frozenset(current_features)) prod_node.id = current_id node_id_assoc[current_id] = prod_node logging.debug('\tCreated Prod Node %s (with children %s)', prod_node, children_ids) else: # # clustering on rows logging.info('---> Splitting on rows') # # at most n_rows clusters, for sklearn k_row_clusters = min(self._n_cluster_splits, n_instances - 1) clustering = cluster_rows( data, current_slice, n_clusters=k_row_clusters, cluster_method=self._row_cluster_method, n_iters=self._n_iters, n_restarts=self._n_restarts, cluster_penalty=self._cluster_penalty, rand_gen=self._rand_gen, sklearn_args=self._sklearn_args) if len(clustering) < 2: logging.info('\n\n\nLess than 2 clusters\n\n (%d)', len(clustering)) logging.info('forcing a naive factorization') current_slice, slices_to_process, building_stack, node_id_assoc = \ self.make_naive_factorization(current_slice, slices_to_process, building_stack, node_id_assoc) else: # logging.debug('obtained clustering %s', clustering) logging.info('clustered into %d parts (min %d)', len(clustering), k_row_clusters) # splitting cluster_slices = [ DataSlice(cluster, current_features) for cluster in clustering ] cluster_slices_ids = [ slice.id for slice in cluster_slices ] # cluster_prior = 5.0 # cluster_weights = [(slice.n_instances() + cluster_prior) / # (n_instances + cluster_prior * len(cluster_slices)) # for slice in cluster_slices] cluster_weights = [ slice.n_instances() / n_instances for slice in cluster_slices ] # # appending for processing slices_to_process.extend(cluster_slices) # # storing links # current_slice.children = cluster_slices_ids # current_slice.weights = cluster_weights current_slice.type = SumNode building_stack.append(current_slice) for child_slice, child_weight in zip( cluster_slices, cluster_weights): current_slice.add_child(child_slice, child_weight) # # building a sum node SCOPES_DICT[frozenset(current_features)] += 1 sum_node = SumNode( var_scope=frozenset(current_features)) sum_node.id = current_id node_id_assoc[current_id] = sum_node logging.debug( '\tCreated Sum Node %s (with children %s)', sum_node, cluster_slices_ids) learn_end_t = perf_counter() logging.info('\n\n\tStructure learned in %f secs', (learn_end_t - learn_start_t)) # # linking the spn graph (parent -> children) # logging.info('===> Building tree') link_start_t = perf_counter() root_build_node = building_stack[0] root_node = node_id_assoc[root_build_node.id] logging.debug('root node: %s', root_node) root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc, building_stack) link_end_t = perf_counter() logging.info('\tLinked the spn in %f secs (root_node %s)', (link_end_t - link_start_t), root_node) # # building layers # logging.info('===> Layering spn') layer_start_t = perf_counter() spn = SpnFactory.layered_linked_spn(root_node) layer_end_t = perf_counter() logging.info('\tLayered the spn in %f secs', (layer_end_t - layer_start_t)) logging.info('\nLearned SPN\n\n%s', spn.stats()) #logging.info('%s', SCOPES_DICT.most_common(30)) return spn