def main_jt():
    dataset_dir = sys.argv[2]
    data_name = sys.argv[4]

    train_name = dataset_dir + data_name + '.ts.data'
    valid_name = dataset_dir + data_name + '.valid.data'
    test_name = dataset_dir + data_name + '.test.data'

    data_train = np.loadtxt(train_name, delimiter=',', dtype=np.uint32)
    data_valid = np.loadtxt(valid_name, delimiter=',', dtype=np.uint32)
    data_test = np.loadtxt(test_name, delimiter=',', dtype=np.uint32)

    # learn a Chow-Liu tree from the training data and report its average
    # test-set log-likelihood
    clt = CLT()
    clt.learnStructure(data_train)
    print 'clt testset loglikelihood score: ', clt.computeLL(data_test) / data_test.shape[0]

    n_variable = data_train.shape[1]
    clt.get_log_cond_cpt()

    # convert the Chow-Liu tree into a junction tree
    jt = JunctionTree()
    jt.learn_structure(clt.topo_order, clt.parents, clt.cond_cpt)

    # query the marginal of every variable, with no evidence
    evid_list = []
    query_var = np.arange(n_variable)

    start = time.time()
    marginal = get_marginal_JT(jt, evid_list, query_var)

    print '------Marginals------'
    for i in xrange(query_var.shape[0]):
        print marginal[i]
    print 'running time for new: ', time.time() - start
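# Note on invocation (assumption, not shown in this file): main_jt() reads the
# dataset directory from sys.argv[2] and the dataset name from sys.argv[4], so
# the surrounding script is expected to be launched with flag-style arguments
# in positions 1 and 3, along the lines of
#     python <script>.py <dir_flag> ../dataset/ <name_flag> nltcs
# The actual flag names are defined by the caller and are not shown here.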
def learnStructureHelper(self, tum, dataset, ids, lamda, beta_function, evid_list,
                         data_ind, next_id=-1, next_weights=np.zeros(2)):
    curr_depth = self.nvariables - ids.shape[0]

    if len(evid_list) == 0:   # the first run
        sub_dataset = dataset
    else:
        if data_ind.shape[0] == 0:
            sub_dataset = np.array([])
        else:
            sub_dataset = dataset[data_ind, :][:, ids]

    # mixing coefficient between the data-based and TUM-based estimates
    alpha = utilM.updata_coef(sub_dataset.shape[0], dataset.shape[0], lamda, beta_function)

    if next_id == -1:
        # TUM part: pairwise and single-variable marginals given the evidence so far
        p_xy, p_x = tum.inference_jt(evid_list, ids)

        if alpha > 0:
            # dataset part, with Laplace correction
            xycounts = Util.compute_xycounts(sub_dataset) + 1
            xcounts = Util.compute_xcounts(sub_dataset) + 2
            p_xy_d = Util.normalize2d(xycounts)
            p_x_d = Util.normalize1d(xcounts)

            # convex combination of the data and TUM estimates
            p_xy = alpha * p_xy_d + (1 - alpha) * p_xy
            p_x = alpha * p_x_d + (1 - alpha) * p_x

        # compute the mutual information score for all pairs of variables
        edgemat = Util.compute_MI_prob(p_xy, p_x)
        # reset self mutual information to 0
        np.fill_diagonal(edgemat, 0)

        # split on the variable with the largest total mutual information
        scores = np.sum(edgemat, axis=0)
        variable = np.argmax(scores)
        variable_id = ids[variable]   # the index in the original file
        p1 = p_x[variable, 1]
        p0 = p_x[variable, 0]
        evid_list.append(np.array([variable_id, -1]))   # -1 means not determined yet

        if curr_depth >= self.depth:
            # maximum depth reached: learn a Chow-Liu tree leaf
            clt_leaf = CLT()
            clt_leaf.learnStructure_MI(edgemat)
            clt_leaf.xyprob = p_xy
            clt_leaf.xprob = p_x
            clt_leaf.get_log_cond_cpt()
            clt_leaf.xyprob = np.zeros((1, 1, 2, 2))   # release the pairwise table to save memory

            # bookkeeping needed to resume growing from this leaf later
            save_info = {}
            save_info['ids'] = ids
            save_info['next_id'] = variable_id
            save_info['next_weights'] = np.array([p0, p1])
            save_info['evid_list'] = evid_list
            save_info['data_ind'] = data_ind
            clt_leaf.save_info = save_info

            return clt_leaf
    else:
        # resume from a saved leaf: the split variable and its weights are given
        variable_id = next_id
        p0 = next_weights[0]
        p1 = next_weights[1]
        variable = np.where(ids == variable_id)[0][0]

    # branch on the chosen variable: fix it to 0 in one child and to 1 in the other
    evid_list_0 = copy.deepcopy(evid_list)
    evid_list_1 = copy.deepcopy(evid_list)
    evid_list_0[-1][1] = 0
    evid_list_1[-1][1] = 1

    new_ids = np.delete(ids, variable)

    if alpha > 0:
        # partition the data records according to the value of the chosen variable
        new_data_ind0 = data_ind[np.where(sub_dataset[:, variable] == 0)[0]]
        new_data_ind1 = data_ind[np.where(sub_dataset[:, variable] == 1)[0]]
    else:
        new_data_ind0 = np.array([])
        new_data_ind1 = np.array([])

    return [variable, variable_id, p0, p1,
            self.learnStructureHelper(tum, dataset, new_ids, lamda, beta_function,
                                      evid_list_0, new_data_ind0),
            self.learnStructureHelper(tum, dataset, new_ids, lamda, beta_function,
                                      evid_list_1, new_data_ind1)]
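# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the learner above): how the alpha
# mixing in learnStructureHelper combines Laplace-smoothed empirical marginals
# with model-based marginals.  The real code uses Util.compute_xycounts /
# Util.normalize2d and tum.inference_jt; the names, shapes, and smoothing
# constants below are assumptions chosen to keep the example self-contained.
# ---------------------------------------------------------------------------
def _demo_alpha_mixing(alpha=0.7):
    import numpy as np
    rng = np.random.RandomState(0)
    data = rng.randint(0, 2, size=(100, 5))   # binary data: 100 samples, 5 variables
    d = data.shape[1]

    # Laplace-style smoothed counts (the exact constants in Util may differ)
    p_xy = np.ones((d, d, 2, 2))   # pairwise counts  P(x_i = a, x_j = b)
    p_x = np.ones((d, 2))          # singleton counts P(x_i = a)
    for a in (0, 1):
        mask_a = (data == a).astype(np.float64)
        p_x[:, a] += mask_a.sum(axis=0)
        for b in (0, 1):
            mask_b = (data == b).astype(np.float64)
            p_xy[:, :, a, b] += np.dot(mask_a.T, mask_b)
    p_xy /= p_xy.sum(axis=(2, 3), keepdims=True)   # normalize to pairwise marginals
    p_x /= p_x.sum(axis=1, keepdims=True)           # normalize to singleton marginals

    # stand-in for the TUM estimates returned by tum.inference_jt(evid_list, ids)
    p_xy_m = np.full((d, d, 2, 2), 0.25)
    p_x_m = np.full((d, 2), 0.5)

    # convex combination, as in learnStructureHelper
    p_xy_mix = alpha * p_xy + (1 - alpha) * p_xy_m
    p_x_mix = alpha * p_x + (1 - alpha) * p_x_m
    return p_xy_mix, p_x_mix

# Example use:  p_xy, p_x = _demo_alpha_mixing(alpha=0.7)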