Example #1
    def learnStructureHelper(self, dataset, ids):
        curr_depth = self.nvariables - dataset.shape[1]

        # Termination condition satisfied, learn the CLTree as a leaf.
        if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
            clt = CLTree()
            clt.train(dataset)
            return clt
        
        pairs = utils_common.compute_pairwise_counts(dataset) + 1  # Laplace correction
        pairs = utils_common.normalize2D(pairs)
        singles = utils_common.compute_single_counts(dataset) + 2  # Laplace correction (one count per binary state)
        singles = utils_common.normalize1D(singles)
        edgemat = utils_common.compute_adjmatrix(pairs, singles)
        np.fill_diagonal(edgemat, 0)  # a variable carries no information about itself
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)  # index into the current (shrinking) dataset, not the global feature columns
        
        new_dataset1 = np.delete(dataset[dataset[:, variable] == 1], variable, axis=1)
        p1 = float(new_dataset1.shape[0]) + 1.0  # +1 smoothing on the split counts
        new_ids = np.delete(ids, variable, axis=0)
        new_dataset0 = np.delete(dataset[dataset[:, variable] == 0], variable, axis=1)
        p0 = float(new_dataset0.shape[0]) + 1.0
        # Normalize the smoothed counts into P(variable = 0) and P(variable = 1)
        p0 = p0 / (p0 + p1)
        p1 = 1.0 - p0

        return [variable, ids[variable], p0, p1,
                self.learnStructureHelper(new_dataset0, new_ids),
                self.learnStructureHelper(new_dataset1, new_ids)]
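
The counting helpers come from the surrounding codebase and are not shown. As a rough sketch of what compute_pairwise_counts likely does for binary data (the shapes are inferred from how pairs is indexed, so treat this as an assumption):

import numpy as np

def compute_pairwise_counts(dataset):
    # Assumed sketch: counts[i, j, a, b] = number of records with
    # X_i == a and X_j == b, for binary {0, 1} data.
    nvars = dataset.shape[1]
    counts = np.zeros((nvars, nvars, 2, 2))
    for a in (0, 1):
        for b in (0, 1):
            xa = (dataset == a).astype(float)
            xb = (dataset == b).astype(float)
            counts[:, :, a, b] = xa.T @ xb  # [i, j] = sum over records
    return counts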
Example #2
    def compute_exact_graph(self, data):
        pairwise_counts = utils.compute_pairwise_counts(data) + 1  # Laplace correction
        self.prob_pair = utils.normalize2D(pairwise_counts)
        single_counts = utils.compute_single_counts(data) + 2  # Laplace correction (one count per binary state)
        self.prob_sing = utils.normalize1D(single_counts)
        adjmat = utils.compute_adjmatrix(self.prob_pair, self.prob_sing)
        adjmat[adjmat == 0.0] = 1e-10  # replace exact zeros so every edge survives: the graph stays complete
        return adjmat
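
compute_adjmatrix itself is not shown; in Chow-Liu style learners the edge weight is typically the pairwise mutual information, so a plausible (assumed) sketch is:

import numpy as np

def compute_adjmatrix(prob_pair, prob_sing):
    # Assumed sketch: adj[i, j] = I(X_i; X_j) under the smoothed estimates.
    # The Laplace correction guarantees no zero probabilities, so the logs are safe.
    nvars = prob_sing.shape[0]
    adj = np.zeros((nvars, nvars))
    for i in range(nvars):
        for j in range(nvars):
            for a in (0, 1):
                for b in (0, 1):
                    adj[i, j] += prob_pair[i, j, a, b] * (
                        np.log(prob_pair[i, j, a, b])
                        - np.log(prob_sing[i, a])
                        - np.log(prob_sing[j, b]))
    return adj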
Example #3
    def compute_best_attr(self, dataset):
        pairs = utils.compute_pairwise_counts(dataset) + 1  # Laplace correction
        pairs = utils.normalize2D(pairs)
        singles = utils.compute_single_counts(dataset) + 2  # Laplace correction (one count per binary state)
        singles = utils.normalize1D(singles)
        edgemat = utils.compute_adjmatrix(pairs, singles)
        np.fill_diagonal(edgemat, 0)  # a variable carries no information about itself
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)  # index into the current dataset's columns, not the global feature columns
        return variable
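
A minimal usage sketch; the instance name `model` is hypothetical, assumed only for illustration:

import numpy as np

rng = np.random.default_rng(0)
data = rng.integers(0, 2, size=(500, 10))  # 500 binary records, 10 variables
best = model.compute_best_attr(data)       # `model` is a hypothetical instance
# `best` indexes the columns of `data` as passed in, not any global feature list.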
Example #4
    def train_weighted(self, weights, data):
        # weights: a numpy vector assigning one nonnegative weight to each row of data
        N = data.shape[0]
        alpha = max(np.sum(weights), 1) / N  # smoothing pseudo-count scaled by the total weight
        pairwise_counts = utils.compute_pairwise_counts_weighted(data, weights) + alpha
        self.prob_pair = utils.normalize2D(pairwise_counts)
        single_counts = utils.compute_single_counts_weighted(data, weights) + 2 * alpha
        self.prob_sing = utils.normalize1D(single_counts)

        # Requires: from scipy.sparse import csr_matrix
        #           from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_order
        adjmat = utils.compute_adjmatrix(self.prob_pair, self.prob_sing)
        adjmat *= -1.0  # negate so the maximum-weight tree becomes a minimum spanning tree
        adjmat[adjmat == 0.0] = 1e-10  # keep zero entries from being dropped by the sparse representation
        mstree = minimum_spanning_tree(csr_matrix(adjmat))
        self.node_order, self.parent = depth_first_order(mstree, 0, directed=False)
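
normalize2D and normalize1D also come from utils. A minimal sketch consistent with the shapes used above (an assumption, not the repository's code):

import numpy as np

def normalize2D(counts):
    # Turn each (i, j) 2x2 count table into a joint distribution over (X_i, X_j).
    return counts / counts.sum(axis=(2, 3), keepdims=True)

def normalize1D(counts):
    # counts has shape (nvars, 2); normalize each row into a marginal P(X_i).
    return counts / counts.sum(axis=1, keepdims=True)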
Example #5
    def compute_approx_graph(self, data, samp_k):
        nvars = data.shape[1]
        adjmat = np.zeros((nvars, nvars))        
        single_counts = utils.compute_single_counts(data) + 2
        self.prob_sing = utils.normalize1D(single_counts)        
        pairprob_arr = np.zeros((nvars, nvars, 2, 2))

        nodes = list(range(nvars))
        curr_node = utils.select_nodes(nodes)
        nodes_in_tree = [curr_node]
        nodes.remove(curr_node)
        steps = 0
        flag = 0  # set to 1 once every node has been added to the tree
        while nodes or steps < nvars - 1:
            if flag == 1:
                # All nodes are already in the tree; sample from every node except the current one.
                tmp_nodes = [i for i in range(nvars) if i != curr_node]
                candidates = utils.select_nodes(tmp_nodes, num=samp_k)
            elif len(nodes) <= samp_k:  # too few left to sample, take them all
                candidates = nodes
            else:
                candidates = utils.select_nodes(nodes, num=samp_k)

            for can in candidates:
                score = utils.compute_mutualinfo(data, curr_node, can)
                adjmat[curr_node, can] = score
                adjmat[can, curr_node] = score
                # Set the CPTs corresponding to this pair (curr_node, can)
                cpt_xy, cpt_yx = utils.compute_cpt_xy(data, curr_node, can)
                pairprob_arr[curr_node, can, :, :] = cpt_xy
                pairprob_arr[can, curr_node, :, :] = cpt_yx

            # Update the list of remaining nodes.
            if flag == 0:  # not all nodes visited yet
                nodes_in_tree.extend(candidates)
                nodes = [ele for ele in nodes if ele not in candidates]
            if not nodes:
                flag = 1
            steps += 1
            # Pick the next current node from the nodes already in the tree.
            curr_node = utils.select_nodes(nodes_in_tree)

        self.prob_pair = pairprob_arr  # reused later for inference
        adjmat += np.identity(nvars)   # make the diagonal entries nonzero
        return adjmat
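
utils.select_nodes is used both to draw a single node and to draw samp_k candidates. A sketch matching both call sites (assumed, since the helper is not shown):

import random

def select_nodes(nodes, num=1):
    # Assumed behavior: return one random node when num == 1,
    # otherwise a list of `num` distinct random nodes.
    if num == 1:
        return random.choice(nodes)
    return random.sample(nodes, num)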
Example #6
    def build_tree(self, dataset, ids):
        curr_depth = self.nvariables - dataset.shape[1]

        # Termination condition satisfied, learn the CLTree as a leaf.
        if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
            clt = CLTree()
            clt.train(dataset)
            leaf_node = CnetTreeNode(clt)  # at a leaf, var holds the CLTree object rather than a feature column index
            return leaf_node
        
        pairs = utils_common.compute_pairwise_counts(dataset) + 1  # Laplace correction
        pairs = utils_common.normalize2D(pairs)
        singles = utils_common.compute_single_counts(dataset) + 2  # Laplace correction (one count per binary state)
        singles = utils_common.normalize1D(singles)
        edgemat = utils_common.compute_adjmatrix(pairs, singles)
        np.fill_diagonal(edgemat, 0)  # a variable carries no information about itself
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)  # index into the current (shrinking) dataset, not the global feature columns

        root_node = CnetTreeNode(variable)  # new internal node for this (dataset, ids) split
        root_node.ids_var = ids[variable]   # the original/global column index behind this local variable
        root_node.depth = curr_depth

        rows_1 = (dataset[:, variable] == 1)  # boolean masks over the records, kept for later use
        rows_0 = (dataset[:, variable] == 0)
        root_node.rows1 = rows_1
        root_node.rows0 = rows_0

        new_ids_list = np.delete(ids, variable, axis=0)  # the split variable is no longer needed
        new_ids_mask = [i != variable for i in range(len(ids))]
        tmp_dataset_1 = dataset[rows_1][:, new_ids_mask]
        tmp_dataset_0 = dataset[rows_0][:, new_ids_mask]
        p1 = float(tmp_dataset_1.shape[0]) + 1.0  # +1 smoothing on the split counts
        p0 = float(tmp_dataset_0.shape[0]) + 1.0
        p0 = p0 / (p0 + p1)  # normalize into P(variable = 0) and P(variable = 1)
        p1 = 1.0 - p0
        root_node.prob0 = p0  # store the edge probabilities on the node
        root_node.prob1 = p1
                
        root_node.child0 = self.build_tree(tmp_dataset_0, new_ids_list)
        root_node.child1 = self.build_tree(tmp_dataset_1, new_ids_list)
        
        return root_node
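
CnetTreeNode itself is not shown. From the attributes set above, a minimal sketch of the node type could look like this (field names are taken from the usage; the defaults are assumptions):

class CnetTreeNode:
    # Minimal sketch inferred from build_tree's usage.
    def __init__(self, var):
        self.var = var        # split column index, or a CLTree object at a leaf
        self.ids_var = None   # original/global column index of the split variable
        self.depth = 0
        self.rows0 = None     # boolean masks over the parent's records
        self.rows1 = None
        self.prob0 = 0.0      # P(var = 0) and P(var = 1) at this node
        self.prob1 = 0.0
        self.child0 = None
        self.child1 = None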
Example #7
    def learn_structure_weight(self, dataset, weights, ids, smooth):
        curr_depth = self.nvariables - dataset.shape[1]

        # Termination condition satisfied, learn the CLTree as a leaf.
        if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
            clt = CLTree()
            clt.train(dataset) 
            clt.prob_pair = np.zeros((1, 1, 2, 2))  # placeholder probability tables
            clt.prob_sing = np.zeros((1, 2))
            return clt
        
        pairs = utils_common.compute_pairwise_counts_weighted(dataset, weights) + smooth  # Laplace correction
        pairs = utils_common.normalize2D(pairs)
        singles = utils_common.compute_single_counts_weighted(dataset, weights) + 2.0 * smooth  # Laplace correction (one count per binary state)
        singles = utils_common.normalize1D(singles)
        edgemat = utils_common.compute_adjmatrix(pairs, singles)

        np.fill_diagonal(edgemat, 0)  # a variable carries no information about itself
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)  # index into the current dataset's columns, not the global feature columns
        
        index1 = np.where(dataset[:, variable] == 1)[0]
        index0 = np.where(dataset[:, variable] == 0)[0]

        new_dataset = np.delete(dataset, variable, axis=1)
        new_dataset1 = new_dataset[index1]
        new_weights1 = weights[index1]
        p1 = np.sum(new_weights1) + smooth

        new_dataset0 = new_dataset[index0]
        new_weights0 = weights[index0]
        p0 = np.sum(new_weights0) + smooth

        # Normalize the smoothed weight mass into P(variable = 0) and P(variable = 1)
        p0 = p0 / (p0 + p1)
        p1 = 1.0 - p0

        new_ids = np.delete(ids, variable, axis=0)
        
        return [variable, ids[variable], p0, p1,
                self.learn_structure_weight(new_dataset0, new_weights0, new_ids, smooth),
                self.learn_structure_weight(new_dataset1, new_weights1, new_ids, smooth)]
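
The nested list returned here (and by learnStructureHelper in Example #1) can be walked recursively for inference. A sketch, assuming CLTree exposes some scoring method (`prob` below is hypothetical):

import numpy as np

def eval_prob(node, sample):
    # Internal nodes: [var, global_var, p0, p1, child0, child1]; leaves: a CLTree.
    if isinstance(node, list):
        var, _, p0, p1, child0, child1 = node
        rest = np.delete(sample, var)  # children were trained without `var`
        if sample[var] == 0:
            return p0 * eval_prob(child0, rest)
        return p1 * eval_prob(child1, rest)
    return node.prob(sample)           # hypothetical CLTree scoring call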
Example #8
    def compute_approx_spantree(self, data, samp_k):        
        nvars = data.shape[1]
        single_counts = utils.compute_single_counts(data) + 2
        self.prob_sing = utils.normalize1D(single_counts)
        pairprob_arr = np.zeros((nvars, nvars, 2, 2))
        parent = np.zeros(nvars, dtype=int)        
        nodes = list(range(nvars))
        edge_count = 0
        nodes_in_tree = []  # nodes already added to the tree

        curr_node = utils.select_nodes(nodes)
        nodes_in_tree.append(curr_node)
        nodes.remove(curr_node)
        parent[curr_node] = -9999  # sentinel parent value marking the root, per the codebase convention

        while edge_count < nvars - 1:
            if len(nodes) <= samp_k:  # too few left to sample, take them all
                candidates = nodes
            else:
                candidates = utils.select_nodes(nodes, num=samp_k)

            best_node = self.give_best_candidate(data, curr_node, candidates)
            nodes_in_tree.append(best_node)
            nodes.remove(best_node)  # remove from the remaining nodes
            parent[best_node] = curr_node
            edge_count += 1

            # Set the CPTs corresponding to this pair (curr_node, best_node)
            cpt_xy, cpt_yx = utils.compute_cpt_xy(data, curr_node, best_node)
            pairprob_arr[curr_node, best_node, :, :] = cpt_xy
            pairprob_arr[best_node, curr_node, :, :] = cpt_yx
            
            # Pick the next current node from the nodes already in the tree.
            curr_node = utils.select_nodes(nodes_in_tree)

        self.prob_pair = pairprob_arr
        return np.arange(nvars), parent
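
give_best_candidate is not shown. Given Example #5's use of utils.compute_mutualinfo, a plausible sketch (an assumption, not the repository's code) is:

import numpy as np

def give_best_candidate(self, data, curr_node, candidates):
    # Assumed: pick the candidate with the highest mutual information to
    # curr_node, i.e. the heaviest edge the spanning tree could add next.
    scores = [utils.compute_mutualinfo(data, curr_node, can) for can in candidates]
    return candidates[int(np.argmax(scores))]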