def learnStructureHelper(self, dataset, ids):
    """Recursively build the nested-list structure for ``dataset``.

    Args:
        dataset: 2-D array; rows are records and columns are the variables
            still available at this level. Columns are compared against 0
            and 1 when splitting.
        ids: Identifiers for the current columns, indexed in parallel with
            the columns of ``dataset``.

    Returns:
        A trained ``CLTree`` when a stopping condition holds; otherwise the
        list ``[variable, ids[variable], p0, p1, subtree0, subtree1]`` where
        ``variable`` is the *local* column index that was split on.
    """
    n_records = dataset.shape[0]
    n_columns = dataset.shape[1]
    # Depth is measured by how many columns have already been removed.
    level = self.nvariables - n_columns

    # Stop: too few records, too few variables, or depth budget used up.
    if n_records < self.min_rec or n_columns < self.min_var or level >= self.depth:
        leaf = CLTree()
        leaf.train(dataset)
        return leaf

    # Smoothed (+1 on pair counts, +2 on single counts) and normalised stats.
    pair_probs = utils_common.normalize2D(
        utils_common.compute_pairwise_counts(dataset) + 1
    )
    single_probs = utils_common.normalize1D(
        utils_common.compute_single_counts(dataset) + 2
    )
    edge_weights = utils_common.compute_adjmatrix(pair_probs, single_probs)
    np.fill_diagonal(edge_weights, 0)  # ignore self-edges when scoring

    # Split on the column with the largest column sum of edge weights.
    # NOTE: this is an index into the *current* columns, not a global id;
    # ``ids`` maps it back.
    split_col = np.argmax(np.sum(edge_weights, axis=0))

    column = dataset[:, split_col]
    part0 = np.delete(dataset[column == 0], split_col, axis=1)
    part1 = np.delete(dataset[column == 1], split_col, axis=1)
    remaining_ids = np.delete(ids, split_col, axis=0)

    # Branch probabilities from add-one smoothed record counts.
    count0 = float(part0.shape[0]) + 1.0
    count1 = float(part1.shape[0]) + 1.0
    p0 = count0 / (count0 + count1)
    p1 = 1.0 - p0

    subtree0 = self.learnStructureHelper(part0, remaining_ids)
    subtree1 = self.learnStructureHelper(part1, remaining_ids)
    return [split_col, ids[split_col], p0, p1, subtree0, subtree1]
def compute_exact_graph(self, data):
    """Compute the adjacency matrix over all variable pairs of ``data``.

    Side effects:
        Stores the normalised, smoothed pairwise statistics in
        ``self.prob_pair`` and the single-variable ones in ``self.prob_sing``.

    Returns:
        The matrix produced by ``utils.compute_adjmatrix`` with every exact
        zero replaced by ``1e-10``.
    """
    smoothed_pairs = utils.compute_pairwise_counts(data) + 1
    self.prob_pair = utils.normalize2D(smoothed_pairs)

    smoothed_singles = utils.compute_single_counts(data) + 2
    self.prob_sing = utils.normalize1D(smoothed_singles)

    graph = utils.compute_adjmatrix(self.prob_pair, self.prob_sing)
    # Keep the graph complete: no entry is allowed to be exactly zero.
    zero_entries = graph == 0.0
    graph[zero_entries] = 1e-10
    return graph
def compute_best_attr(self, dataset):
    """Return the column index of ``dataset`` with the highest edge score.

    The score of a column is its column sum in the matrix returned by
    ``utils.compute_adjmatrix`` after the diagonal has been zeroed.

    Returns:
        The ``np.argmax`` result; an index into the columns of ``dataset``
        as passed in, which need not match any global feature numbering.
    """
    # Smoothed (+1 on pair counts, +2 on single counts) and normalised stats.
    pair_probs = utils.normalize2D(utils.compute_pairwise_counts(dataset) + 1)
    single_probs = utils.normalize1D(utils.compute_single_counts(dataset) + 2)

    edge_weights = utils.compute_adjmatrix(pair_probs, single_probs)
    np.fill_diagonal(edge_weights, 0)  # ignore self-edges when scoring

    column_scores = np.sum(edge_weights, axis=0)
    return np.argmax(column_scores)
def train_weighted(self, weights, data):
    """Fit the tree from ``data`` using one weight per record.

    Args:
        weights: NumPy vector assigning a weight to every row of ``data``.
        data: 2-D array of records.

    Side effects:
        Sets ``self.prob_pair``, ``self.prob_sing``, ``self.node_order`` and
        ``self.parent``.
    """
    n_records = data.shape[0]
    # Smoothing constant: total weight (floored at 1) divided by row count.
    alpha = max(np.sum(weights), 1) / n_records

    weighted_pairs = utils.compute_pairwise_counts_weighted(data, weights) + alpha
    self.prob_pair = utils.normalize2D(weighted_pairs)

    weighted_singles = utils.compute_single_counts_weighted(data, weights) + 2 * alpha
    self.prob_sing = utils.normalize1D(weighted_singles)

    edge_costs = utils.compute_adjmatrix(self.prob_pair, self.prob_sing)
    # Negate so that the *minimum* spanning tree picks the largest weights.
    edge_costs *= -1.0
    # Replace exact zeros; presumably so no edge vanishes as an implicit
    # zero once the matrix is converted to sparse form -- confirm.
    edge_costs[edge_costs == 0.0] = 1e-10

    spanning_tree = minimum_spanning_tree(csr_matrix(edge_costs))
    # Traverse from node 0, treating the tree as undirected.
    self.node_order, self.parent = depth_first_order(
        spanning_tree, 0, directed=False
    )
def compute_approx_graph(self, data, samp_k):
    """Build a partially filled adjacency matrix by sampling candidate edges.

    Starting from a node chosen by ``utils.select_nodes``, each step scores
    the current node against up to ``samp_k`` candidates, records the score
    symmetrically, and stores the pair's CPTs. The loop runs until every node
    has been picked as a candidate and at least ``nvars - 1`` steps are done.

    Args:
        data: 2-D array; columns are variables.
        samp_k: Number of candidates requested from ``utils.select_nodes``
            per step.

    Side effects:
        Sets ``self.prob_sing`` and ``self.prob_pair`` (an array of shape
        ``(nvars, nvars, 2, 2)``; only scored pairs are filled in).

    Returns:
        The ``(nvars, nvars)`` score matrix with the identity added.
    """
    nvars = data.shape[1]
    scores = np.zeros((nvars, nvars))

    self.prob_sing = utils.normalize1D(utils.compute_single_counts(data) + 2)
    pair_cpts = np.zeros((nvars, nvars, 2, 2))

    remaining = list(range(nvars))
    current = utils.select_nodes(remaining)
    visited = [current]
    remaining.remove(current)

    step_count = 0
    exhausted = False  # True once every node has been handed out as a candidate
    while remaining or step_count < nvars - 1:
        if exhausted:
            # Nothing new left: sample from every node except the current one.
            others = [node for node in range(nvars) if node != current]
            candidates = utils.select_nodes(others, num=samp_k)
        elif len(remaining) <= samp_k:
            # Few enough left that sampling is unnecessary; take them all.
            candidates = remaining
        else:
            candidates = utils.select_nodes(remaining, num=samp_k)

        for cand in candidates:
            weight = utils.compute_mutualinfo(data, current, cand)
            scores[current, cand] = scores[cand, current] = weight

            # Store the CPTs for this pair in both orientations.
            cpt_xy, cpt_yx = utils.compute_cpt_xy(data, current, cand)
            pair_cpts[current, cand, :, :] = cpt_xy
            pair_cpts[cand, current, :, :] = cpt_yx

        if not exhausted:
            # Candidates join the visited pool and leave the remaining list.
            visited.extend(candidates)
            remaining = [node for node in remaining if node not in candidates]
            if not remaining:
                exhausted = True

        step_count += 1
        # Next step starts from a node drawn from those already visited.
        current = utils.select_nodes(visited)

    self.prob_pair = pair_cpts  # kept for inference
    # Adds 1 to every diagonal entry; the reason is not visible in this block.
    scores += np.identity(nvars)
    return scores
def build_tree(self, dataset, ids):
    """Recursively build a tree of ``CnetTreeNode`` objects for ``dataset``.

    Args:
        dataset: 2-D array; rows are records and columns are the variables
            still available at this level.
        ids: Identifiers for the current columns, indexed in parallel with
            the columns of ``dataset``.

    Returns:
        A ``CnetTreeNode``. A leaf wraps a trained ``CLTree``; an internal
        node wraps the local split column index and carries ``ids_var``,
        ``depth``, ``rows1``/``rows0``, ``prob0``/``prob1`` and
        ``child0``/``child1``.
    """
    n_records = dataset.shape[0]
    n_columns = dataset.shape[1]
    # Depth is measured by how many columns have already been removed.
    level = self.nvariables - n_columns

    # Stop: too few records, too few variables, or depth budget used up.
    if n_records < self.min_rec or n_columns < self.min_var or level >= self.depth:
        leaf_model = CLTree()
        leaf_model.train(dataset)
        # For a leaf the node payload is the CLTree itself, not a column index.
        return CnetTreeNode(leaf_model)

    # Smoothed (+1 on pair counts, +2 on single counts) and normalised stats.
    pair_probs = utils_common.normalize2D(
        utils_common.compute_pairwise_counts(dataset) + 1
    )
    single_probs = utils_common.normalize1D(
        utils_common.compute_single_counts(dataset) + 2
    )
    edge_weights = utils_common.compute_adjmatrix(pair_probs, single_probs)
    np.fill_diagonal(edge_weights, 0)  # ignore self-edges when scoring

    # Local column index with the largest column sum; ``ids`` maps it back
    # to the caller's identifier for that column.
    split_col = np.argmax(np.sum(edge_weights, axis=0))

    node = CnetTreeNode(split_col)
    node.ids_var = ids[split_col]
    node.depth = level

    # Boolean row masks for each branch, kept on the node for later use.
    column = dataset[:, split_col]
    mask1 = column == 1
    mask0 = column == 0
    node.rows1 = mask1
    node.rows0 = mask0

    # Drop the split column from both the id list and the data.
    child_ids = np.delete(ids, split_col, axis=0)
    keep_cols = [bool(col != split_col) for col in range(len(ids))]
    subset1 = dataset[mask1][:, keep_cols]
    subset0 = dataset[mask0][:, keep_cols]

    # Branch probabilities from add-one smoothed record counts.
    count1 = float(subset1.shape[0]) + 1.0
    count0 = float(subset0.shape[0]) + 1.0
    prob0 = count0 / (count0 + count1)
    prob1 = 1.0 - prob0
    node.prob0 = prob0
    node.prob1 = prob1

    node.child0 = self.build_tree(subset0, child_ids)
    node.child1 = self.build_tree(subset1, child_ids)
    return node
def learn_structure_weight(self, dataset, weights, ids, smooth):
    """Recursively build the nested-list structure from weighted records.

    Args:
        dataset: 2-D array; rows are records and columns are the variables
            still available at this level.
        weights: Per-record weights, indexed in parallel with the rows.
        ids: Identifiers for the current columns.
        smooth: Smoothing constant added to pair counts and branch masses
            (and, doubled, to single counts).

    Returns:
        At a stopping condition, a ``CLTree`` trained on ``dataset`` whose
        ``prob_pair`` and ``prob_sing`` have been overwritten with zero
        arrays of shape ``(1, 1, 2, 2)`` and ``(1, 2)``. Otherwise the list
        ``[variable, ids[variable], p0, p1, subtree0, subtree1]``.
    """
    n_records = dataset.shape[0]
    n_columns = dataset.shape[1]
    level = self.nvariables - n_columns

    # Stop: too few records, too few variables, or depth budget used up.
    if n_records < self.min_rec or n_columns < self.min_var or level >= self.depth:
        leaf = CLTree()
        leaf.train(dataset)
        # The trained parameters are replaced by zero placeholders here;
        # presumably they are filled in elsewhere -- confirm with callers.
        leaf.prob_pair = np.zeros((1, 1, 2, 2))
        leaf.prob_sing = np.zeros((1, 2))
        return leaf

    # Weighted, smoothed and normalised statistics.
    pair_probs = utils_common.normalize2D(
        utils_common.compute_pairwise_counts_weighted(dataset, weights) + smooth
    )
    single_probs = utils_common.normalize1D(
        utils_common.compute_single_counts_weighted(dataset, weights) + 2.0 * smooth
    )
    edge_weights = utils_common.compute_adjmatrix(pair_probs, single_probs)
    np.fill_diagonal(edge_weights, 0)  # ignore self-edges when scoring

    # Local column index with the largest column sum of edge weights.
    split_col = np.argmax(np.sum(edge_weights, axis=0))

    column = dataset[:, split_col]
    rows1 = np.flatnonzero(column == 1)
    rows0 = np.flatnonzero(column == 0)
    reduced = np.delete(dataset, split_col, axis=1)

    part1 = reduced[rows1]
    weights1 = weights[rows1]
    part0 = reduced[rows0]
    weights0 = weights[rows0]

    # Branch probabilities from smoothed weight mass on each side.
    mass1 = np.sum(weights1) + smooth
    mass0 = np.sum(weights0) + smooth
    p0 = mass0 / (mass0 + mass1)
    p1 = 1.0 - p0

    remaining_ids = np.delete(ids, split_col, axis=0)
    subtree0 = self.learn_structure_weight(part0, weights0, remaining_ids, smooth)
    subtree1 = self.learn_structure_weight(part1, weights1, remaining_ids, smooth)
    return [split_col, ids[split_col], p0, p1, subtree0, subtree1]
def compute_approx_spantree(self, data, samp_k):
    """Grow a spanning tree by repeatedly attaching a sampled best candidate.

    A root is drawn with ``utils.select_nodes``. Each of the ``nvars - 1``
    steps offers up to ``samp_k`` unattached nodes to
    ``self.give_best_candidate``, attaches the winner under the current node,
    stores the pair's CPTs, and then redraws the current node from the nodes
    already in the tree.

    Args:
        data: 2-D array; columns are variables.
        samp_k: Maximum number of candidates sampled per step.

    Side effects:
        Sets ``self.prob_sing`` and ``self.prob_pair`` (an array of shape
        ``(nvars, nvars, 2, 2)``; only tree edges are filled in).

    Returns:
        ``(np.arange(nvars), parent)`` where ``parent[i]`` is the node that
        ``i`` was attached under and the root's entry is ``-9999``.
    """
    nvars = data.shape[1]

    self.prob_sing = utils.normalize1D(utils.compute_single_counts(data) + 2)
    pair_cpts = np.zeros((nvars, nvars, 2, 2))
    parent = np.zeros(nvars, dtype=int)

    unattached = list(range(nvars))
    current = utils.select_nodes(unattached)
    attached = [current]
    unattached.remove(current)
    parent[current] = -9999  # root marker, following the file's convention

    # One iteration per tree edge.
    for _ in range(nvars - 1):
        if len(unattached) <= samp_k:
            # Few enough left that sampling is unnecessary; offer them all.
            candidates = unattached
        else:
            candidates = utils.select_nodes(unattached, num=samp_k)

        winner = self.give_best_candidate(data, current, candidates)
        attached.append(winner)
        unattached.remove(winner)
        parent[winner] = current

        # Store the CPTs for the new edge in both orientations.
        cpt_xy, cpt_yx = utils.compute_cpt_xy(data, current, winner)
        pair_cpts[current, winner, :, :] = cpt_xy
        pair_cpts[winner, current, :, :] = cpt_yx

        # The next edge starts from a node drawn from those already attached.
        current = utils.select_nodes(attached)

    self.prob_pair = pair_cpts
    return np.arange(nvars), parent