# Imports used throughout this section; CLT, JunctionTree, MIXTURE_CLT,
# Util, utilM and get_marginal_JT are defined elsewhere in this repository.
import copy
import sys
import time

import numpy as np


def learnStructureHelper(self, dataset, ids):
    curr_depth = self.nvariables - dataset.shape[1]
    # learn a Chow-Liu tree leaf once the data slice is too small or too deep
    if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
        clt = CLT()
        clt.learnStructure(dataset)
        return clt
    xycounts = Util.compute_xycounts(dataset) + 1  # laplace correction
    xcounts = Util.compute_xcounts(dataset) + 2  # laplace correction
    # compute the mutual information score for all pairs of variables
    edgemat = Util.compute_edge_weights(xycounts, xcounts)
    np.fill_diagonal(edgemat, 0)
    # split on the variable with the largest total mutual information
    scores = np.sum(edgemat, axis=0)
    variable = np.argmax(scores)
    new_dataset1 = np.delete(dataset[dataset[:, variable] == 1], variable, 1)
    p1 = float(new_dataset1.shape[0]) + 1.0
    new_ids = np.delete(ids, variable, 0)
    new_dataset0 = np.delete(dataset[dataset[:, variable] == 0], variable, 1)
    p0 = float(new_dataset0.shape[0]) + 1.0
    return [variable, ids[variable], p0, p1,
            self.learnStructureHelper(new_dataset0, new_ids),
            self.learnStructureHelper(new_dataset1, new_ids)]
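# A minimal sketch (not part of the original code) of how the nested list
# returned by learnStructureHelper can be evaluated for one sample x.
# It assumes the branch weights p0/p1 are first normalized to probabilities
# (learnStructureHelper returns raw smoothed counts) and that CLT leaves
# expose computeLL() as used in main_jt below.
def cnet_log_prob(node, x):
    if isinstance(node, list):  # internal OR node: [variable, id, p0, p1, child0, child1]
        variable, var_id, p0, p1, child0, child1 = node
        z = p0 + p1
        x_rest = np.delete(x, variable)
        if x[variable] == 0:
            return np.log(p0 / z) + cnet_log_prob(child0, x_rest)
        return np.log(p1 / z) + cnet_log_prob(child1, x_rest)
    return node.computeLL(x.reshape(1, -1))  # CLT leaf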
def load_mt(in_dir, data_name):
    # reload a saved mixture of Chow-Liu trees from an .npz archive
    infile = in_dir + data_name + '.npz'
    reload_dict = np.load(infile)
    reload_mix_clt = MIXTURE_CLT()
    reload_mix_clt.mixture_weight = reload_dict['weights']
    reload_mix_clt.n_components = reload_mix_clt.mixture_weight.shape[0]
    reload_clt_component = reload_dict['clt_component']
    for i in xrange(reload_mix_clt.n_components):
        clt_c = CLT()
        curr_component = reload_clt_component[i]
        clt_c.xyprob = curr_component['xyprob']
        clt_c.xprob = curr_component['xprob']
        clt_c.topo_order = curr_component['topo_order']
        clt_c.parents = curr_component['parents']
        clt_c.log_cond_cpt = curr_component['log_cond_cpt']
        clt_c.cond_cpt = np.exp(clt_c.log_cond_cpt)
        reload_mix_clt.clt_list.append(clt_c)
    return reload_mix_clt
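# Hypothetical usage of load_mt (directory and dataset name are placeholders):
#
#   mix = load_mt('../module/', 'nltcs')
#   print mix.n_components, mix.mixture_weight
#
# The .npz archive is expected to hold 'weights' plus a 'clt_component'
# array whose entries carry the per-tree fields read above.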
def learnStructureP_Helper(self, dataset, ids, portion):
    curr_depth = self.nvariables - dataset.shape[1]
    if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
        clt = CLT()
        clt.learnStructure(dataset)
        return clt
    xycounts = Util.compute_xycounts(dataset) + 1  # laplace correction
    xcounts = Util.compute_xcounts(dataset) + 2  # laplace correction
    # compute the mutual information score for all pairs of variables
    edgemat = Util.compute_edge_weights(xycounts, xcounts)
    np.fill_diagonal(edgemat, 0)
    scores = np.sum(edgemat, axis=0)
    # restrict the choice of split variable to a random fraction (portion)
    # of the remaining variables
    ind_portion = np.random.choice(ids.shape[0], int(ids.shape[0] * portion), replace=False)
    scores_portion = scores[ind_portion]
    variable = ind_portion[np.argmax(scores_portion)]
    new_dataset1 = np.delete(dataset[dataset[:, variable] == 1], variable, 1)
    p1 = float(new_dataset1.shape[0]) + 1.0
    new_ids = np.delete(ids, variable, 0)
    new_dataset0 = np.delete(dataset[dataset[:, variable] == 0], variable, 1)
    p0 = float(new_dataset0.shape[0]) + 1.0
    return [variable, ids[variable], p0, p1,
            self.learnStructureP_Helper(new_dataset0, new_ids, portion),
            self.learnStructureP_Helper(new_dataset1, new_ids, portion)]
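# Toy illustration (made-up values) of the "portion" heuristic in isolation:
# the split variable is the best-scoring variable among a random fraction
# of the candidates rather than among all of them.
#
#   scores = np.array([0.3, 0.9, 0.1, 0.7, 0.5])
#   ind_portion = np.random.choice(5, int(5 * 0.6), replace=False)
#   variable = ind_portion[np.argmax(scores[ind_portion])]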
def main_jt():
    dataset_dir = sys.argv[2]
    data_name = sys.argv[4]
    train_name = dataset_dir + data_name + '.ts.data'
    valid_name = dataset_dir + data_name + '.valid.data'
    test_name = dataset_dir + data_name + '.test.data'
    data_train = np.loadtxt(train_name, delimiter=',', dtype=np.uint32)
    data_valid = np.loadtxt(valid_name, delimiter=',', dtype=np.uint32)
    data_test = np.loadtxt(test_name, delimiter=',', dtype=np.uint32)

    clt = CLT()
    clt.learnStructure(data_train)
    print 'clt testset loglikelihood score: ', clt.computeLL(data_test) / data_test.shape[0]

    n_variable = data_train.shape[1]
    clt.get_log_cond_cpt()

    jt = JunctionTree()
    jt.learn_structure(clt.topo_order, clt.parents, clt.cond_cpt)

    evid_list = []
    query_var = np.arange(n_variable)

    start = time.time()
    marginal = get_marginal_JT(jt, evid_list, query_var)

    print '------Marginals------'
    for i in xrange(query_var.shape[0]):
        print marginal[i]
    print 'running time for new: ', time.time() - start
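# main_jt reads its arguments positionally; only sys.argv[2] and sys.argv[4]
# are used, so the flag names below are hypothetical placeholders:
#
#   python main.py -dir datasets/ -data nltcs
#
# which loads datasets/nltcs.ts.data, .valid.data and .test.data.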
def learnStructure(self, dataset, n_components):
    # Learn a mixture of Chow-Liu trees: shuffle the data, split it into
    # n_components roughly equal shards, and fit one CLT per shard.
    self.n_components = n_components
    self.mixture_weight = np.full(n_components, 1.0 / n_components)
    data_shuffle = np.copy(dataset)
    np.random.shuffle(data_shuffle)
    n_data = data_shuffle.shape[0] // self.n_components
    for c in xrange(self.n_components):
        if c == self.n_components - 1:  # the last shard absorbs the remainder
            data_slice = data_shuffle[c * n_data:, :]
        else:
            data_slice = data_shuffle[c * n_data:(c + 1) * n_data, :]
        clt = CLT()
        clt.learnStructure(data_slice)
        self.clt_list.append(clt)
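# Hypothetical usage: three shards, one CLT per shard, uniform weights.
#
#   mix = MIXTURE_CLT()
#   mix.learnStructure(data_train, n_components=3)
#   print mix.mixture_weight   # [1/3, 1/3, 1/3]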
def learn_structure_weight(self, dataset, weights, ids, smooth):
    curr_depth = self.nvariables - dataset.shape[1]
    if dataset.shape[0] < self.min_rec or dataset.shape[1] < self.min_var or curr_depth >= self.depth:
        clt = CLT()
        clt.learnStructure(dataset)
        clt.xyprob = np.zeros((1, 1, 2, 2))  # placeholders to save memory
        clt.xprob = np.zeros((1, 2))
        return clt
    self.xycounts = Util.compute_weighted_xycounts(dataset, weights) + smooth
    self.xcounts = Util.compute_weighted_xcounts(dataset, weights) + 2.0 * smooth
    edgemat = Util.compute_edge_weights(self.xycounts, self.xcounts)
    np.fill_diagonal(edgemat, 0)
    # split on the variable with the largest total mutual information
    scores = np.sum(edgemat, axis=0)
    variable = np.argmax(scores)
    index1 = np.where(dataset[:, variable] == 1)[0]
    index0 = np.where(dataset[:, variable] == 0)[0]
    new_dataset = np.delete(dataset, variable, axis=1)
    new_dataset1 = new_dataset[index1]
    new_weights1 = weights[index1]
    p1 = np.sum(new_weights1) + smooth
    new_dataset0 = new_dataset[index0]
    new_weights0 = weights[index0]
    p0 = np.sum(new_weights0) + smooth
    # normalize the branch probabilities
    p0 = p0 / (p0 + p1)
    p1 = 1.0 - p0
    new_ids = np.delete(ids, variable, 0)
    return [variable, ids[variable], p0, p1,
            self.learn_structure_weight(new_dataset0, new_weights0, new_ids, smooth),
            self.learn_structure_weight(new_dataset1, new_weights1, new_ids, smooth)]
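# Worked toy example (made-up numbers) of the branch weights computed above:
# with record weights [0.2, 0.5, 0.3], split column [1, 0, 1] and smooth=0.1,
# p1 = (0.2 + 0.3) + 0.1 = 0.6 and p0 = 0.5 + 0.1 = 0.6, which normalize
# to p0 = p1 = 0.5.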
def learnStructureHelper(self, tum, dataset, ids, lamda, beta_function,
                         evid_list, data_ind, next_id=-1, next_weights=np.zeros(2)):
    curr_depth = self.nvariables - ids.shape[0]
    if len(evid_list) == 0:  # the first run
        sub_dataset = dataset
    else:
        if data_ind.shape[0] == 0:
            sub_dataset = np.array([])
        else:
            sub_dataset = dataset[data_ind, :][:, ids]
    # alpha blends the data statistics with the TUM inference results
    alpha = utilM.updata_coef(sub_dataset.shape[0], dataset.shape[0], lamda, beta_function)
    if next_id == -1:
        # tum part
        p_xy, p_x = tum.inference_jt(evid_list, ids)
        if alpha > 0:
            # dataset part
            xycounts = Util.compute_xycounts(sub_dataset) + 1  # laplace correction
            xcounts = Util.compute_xcounts(sub_dataset) + 2  # laplace correction
            p_xy_d = Util.normalize2d(xycounts)
            p_x_d = Util.normalize1d(xcounts)
            # convex combination of the data and TUM distributions
            p_xy = alpha * p_xy_d + (1 - alpha) * p_xy
            p_x = alpha * p_x_d + (1 - alpha) * p_x
        # compute the mutual information score for all pairs of variables
        edgemat = Util.compute_MI_prob(p_xy, p_x)
        # reset self mutual information to 0
        np.fill_diagonal(edgemat, 0)
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)
        variable_id = ids[variable]  # the index in the original file
        p1 = p_x[variable, 1]
        p0 = p_x[variable, 0]
        evid_list.append(np.array([variable_id, -1]))  # -1 means not determined yet
        if curr_depth >= self.depth:
            clt_leaf = CLT()
            clt_leaf.learnStructure_MI(edgemat)
            clt_leaf.xyprob = p_xy
            clt_leaf.xprob = p_x
            clt_leaf.get_log_cond_cpt()
            clt_leaf.xyprob = np.zeros((1, 1, 2, 2))  # save memory
            save_info = {}
            save_info['ids'] = ids
            save_info['next_id'] = variable_id
            save_info['next_weights'] = np.array([p0, p1])
            save_info['evid_list'] = evid_list
            save_info['data_ind'] = data_ind
            clt_leaf.save_info = save_info
            return clt_leaf
    else:
        variable_id = next_id
        p0 = next_weights[0]
        p1 = next_weights[1]
        variable = np.where(ids == variable_id)[0][0]
    evid_list_0 = copy.deepcopy(evid_list)
    evid_list_1 = copy.deepcopy(evid_list)
    evid_list_0[-1][1] = 0
    evid_list_1[-1][1] = 1
    new_ids = np.delete(ids, variable)
    if alpha > 0:
        new_data_ind0 = data_ind[np.where(sub_dataset[:, variable] == 0)[0]]
        new_data_ind1 = data_ind[np.where(sub_dataset[:, variable] == 1)[0]]
    else:
        new_data_ind0 = np.array([])
        new_data_ind1 = np.array([])
    return [variable, variable_id, p0, p1,
            self.learnStructureHelper(tum, dataset, new_ids, lamda, beta_function,
                                      evid_list_0, new_data_ind0),
            self.learnStructureHelper(tum, dataset, new_ids, lamda, beta_function,
                                      evid_list_1, new_data_ind1)]
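# Toy illustration (made-up numbers) of the convex blend above: with
# alpha = 0.3, a data marginal [0.5, 0.5] and a TUM marginal [0.6, 0.4],
#
#   p_x = 0.3 * np.array([0.5, 0.5]) + 0.7 * np.array([0.6, 0.4])
#       # -> [0.57, 0.43]
#
# alpha itself comes from utilM.updata_coef and presumably scales with the
# fraction of training records that reach this node, so nodes with little
# local data lean more on the TUM distribution.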