def gini_reduction(x_mpz, y_mpz, ndata, rule_idx, points_cap=None):
    """Rank features by the Gini impurity reduction obtained by splitting on them.

    For every feature index in ``rule_idx`` the captured points are split into
    the part where the feature bit is 0 and the part where it is 1, and the
    weighted Gini impurity of the two parts is subtracted from the impurity
    of the unsplit set.

    Args:
        x_mpz: sequence of per-feature bit vectors, indexed by feature number.
        y_mpz: bit vector of the labels.
        ndata: number of samples; bit ``ndata`` is used as the leading marker
            bit when a feature vector is negated.
        rule_idx: indexable sequence of the feature indices to evaluate.
        points_cap: bit vector of the samples to consider.  When ``None``,
            ``make_all_ones(ndata + 1)`` is used.

    Returns:
        A pair ``(odr, dic)``: ``odr`` lists the entries of ``rule_idx``
        ordered by descending Gini reduction, and ``dic`` maps
        ``rule_idx[k] + 1`` to ``odr[k]``.

    NOTE(review): ``rule_vand`` is assumed to return ``(a & b, number of
    captured samples)`` and ``count_ones`` the number of captured samples in
    a mask -- confirm against their definitions.
    """
    if points_cap is None:
        points_cap = make_all_ones(ndata + 1)

    ndata0 = count_ones(points_cap)
    _, ndata01 = rule_vand(y_mpz, points_cap)

    # Same zero guard as for the two children below: an empty capture set has
    # impurity 0 instead of raising ZeroDivisionError.
    p0 = ndata01 / ndata0 if ndata0 != 0 else 0
    gini0 = 2 * p0 * (1 - p0)

    # Marker bit that is OR-ed back in after negating a feature vector;
    # loop-invariant, so it is built once.
    lead_bit = mpz(pow(2, ndata))

    gr = []
    for i in rule_idx:
        xi = x_mpz[i]

        # Child 1: captured points whose feature bit is 0.
        l1_cap, ndata1 = rule_vand(points_cap, ~xi | lead_bit)
        _, ndata11 = rule_vand(l1_cap, y_mpz)

        # Child 2: captured points whose feature bit is 1.
        l2_cap, ndata2 = rule_vand(points_cap, xi)
        _, ndata21 = rule_vand(l2_cap, y_mpz)

        p1 = ndata11 / ndata1 if ndata1 != 0 else 0
        p2 = ndata21 / ndata2 if ndata2 != 0 else 0
        gini1 = 2 * p1 * (1 - p1)
        gini2 = 2 * p2 * (1 - p2)

        weight1 = ndata1 / ndata0 if ndata0 != 0 else 0
        weight2 = ndata2 / ndata0 if ndata0 != 0 else 0
        gr.append(gini0 - weight1 * gini1 - weight2 * gini2)

    gr = np.array(gr)
    order = list(gr.argsort()[::-1])
    odr = [rule_idx[r] for r in order]

    print("gr:", gr)
    print("order:", order)
    print("odr:", odr)

    dic = dict(zip(np.array(rule_idx) + 1, odr))
    return odr, dic
def __init__(self, ndata, rules, y_mpz, z_mpz, points_cap, num_captured,
             lamb, support, is_feature_dead):
    """Build a leaf and derive its prediction, error counts and loss.

    Args:
        ndata: total number of samples; used as the denominator of ``B0``
            and ``loss``.
        rules: the rules identifying this leaf (stored as given).
        y_mpz: bit vector of the labels.
        z_mpz: bit vector AND-ed with ``points_cap`` to obtain the count
            behind ``B0`` (the original comment refers to "b0 ... defined
            in (28)").
        points_cap: bit vector of the samples captured by this leaf.
        num_captured: number of samples captured by this leaf.
        lamb: threshold compared against the leaf loss when ``support`` is set.
        support: when truthy, ``is_dead`` is ``loss <= lamb``; otherwise 0.
        is_feature_dead: per-feature flags, stored as given.
    """
    self.rules = rules
    self.points_cap = points_cap
    self.num_captured = num_captured
    self.is_feature_dead = is_feature_dead

    # Second element of rule_vand's result is used as a count.
    positives = rule_vand(points_cap, y_mpz)[1]
    minority = rule_vand(points_cap, z_mpz)[1]
    self.B0 = minority / ndata

    if not self.num_captured:
        # Empty leaf: predict 0 and make no mistakes.
        self.prediction = 0
        self.num_captured_incorrect = 0
        self.p = 0
    else:
        # Majority vote, ties going to class 1.
        self.prediction = int(positives / self.num_captured >= 0.5)
        self.num_captured_incorrect = (
            self.num_captured - positives
            if self.prediction == 1
            else positives
        )
        self.p = self.num_captured_incorrect / self.num_captured

    self.loss = float(self.num_captured_incorrect) / ndata

    # Lower bound on leaf support
    self.is_dead = (self.loss <= lamb) if support else 0
def bbound(x, y, lamb, prior_metric=None, MAXDEPTH=float('Inf'),
           MAX_NLEAVES=float('Inf'), niter=float('Inf'), logon=False,
           support=True, incre_support=True, accu_support=True,
           equiv_points=True, lookahead=True, lenbound=True, R_c0=1,
           timelimit=float('Inf'), init_cart=True, saveTree=False,
           readTree=False):
    """Best-first branch-and-bound search over decision trees.

    Trees are kept in a heap keyed by ``Tree.metric``.  Each queued tree
    carries a ``splitleaf`` mask marking which leaves are to be split; popping
    a tree generates every assignment of unused features to the marked leaves
    and pushes the resulting children (one copy per admissible split mask).

    Args:
        x: 2-D feature array, indexed as ``x[:, j]`` / ``x[i][j]``.
        y: label array; treated as 0/1 (majority is ``mean >= 0.5``).
        lamb: per-leaf regularisation constant.
        prior_metric: forwarded to ``Tree``; also part of the log file name.
        MAXDEPTH: leaves with ``len(rules) >= MAXDEPTH`` are not split.
        MAX_NLEAVES: children that would exceed this leaf count are not queued.
        niter: stop once this many trees have been pushed.
        logon: when true, call ``log`` per pushed tree and write a log file.
        support: forwarded to ``CacheLeaf``.
        incre_support: forwarded to ``generate_new_splitleaf``.
        accu_support: enable the per-leaf accuracy bound that marks a feature
            dead for the parent leaf.
        equiv_points: add the leaves' ``B0`` to the lower bound.
        lookahead: add ``lamb`` per split leaf to the lower bound.
        lenbound: enable the bound on the number of leaves.
        R_c0: initial upper bound on the objective; used if below the
            warm-start objective.
        timelimit: wall-clock budget in seconds.
        init_cart: warm start from a scikit-learn ``DecisionTreeClassifier``.
        saveTree: pickle the best tree and the leaf cache on exit.
        readTree: load a pickled tree / leaf cache and continue from it.

    Returns:
        ``(leaves_c, prediction_c, dic, nleaves, nrule, ndata, totaltime,
        time_c, COUNT, C_c, accu, best_is_cart, clf)``.  ``clf`` is ``None``
        when no warm start was requested.
    """
    x0 = copy.deepcopy(x)
    y0 = copy.deepcopy(y)

    tic = time.time()

    nrule = x.shape[1]
    ndata = len(y)
    max_nleaves = 2**nrule
    print("nrule:", nrule)
    print("ndata:", ndata)

    x_mpz = [rule_vectompz(x[:, i]) for i in range(nrule)]
    y_mpz = rule_vectompz(y)

    # order the columns by descending gini reduction
    idx, dic = gini_reduction(x_mpz, y_mpz, ndata, range(nrule))
    x = x[:, idx]
    x_mpz = [x_mpz[i] for i in idx]
    print("the order of x's columns: ", idx)

    # z is used for the equivalent points bound (original comment: "the
    # vector defined in algorithm 5 of the CORELS paper").  It flags the
    # samples that carry the minority label within their set of samples
    # sharing exactly the same feature values.
    z = pd.DataFrame([-1] * ndata).values
    for i in range(ndata):
        # z[i, 0] == -1 means sample i has not been assigned to a set yet
        if z[i, 0] == -1:
            tag1 = np.array([True] * ndata)
            for j in range(nrule):
                rule_label = x[i][j]
                # tag1: samples with exactly the same features as sample i
                tag1 = (x[:, j] == rule_label) * tag1

            y_l = y[tag1]
            pred = int(y_l.sum() / len(y_l) >= 0.5)
            # tag2: members of the set that have the minority label
            tag2 = (y_l != pred)
            z[tag1, 0] = tag2

    z_mpz = rule_vectompz(z.reshape(1, -1)[0])

    lines = []  # a list for log
    leaf_cache = {}  # cache leaves
    tree_cache = {}  # cache trees

    # initialize the queue to include just empty root
    queue = []
    root_leaf = CacheLeaf(ndata, (), y_mpz, z_mpz, make_all_ones(ndata + 1),
                          ndata, lamb, support, [0] * nrule)
    d_c = CacheTree(leaves=[root_leaf], lamb=lamb)
    R_c = d_c.risk

    tree0 = Tree(cache_tree=d_c, lamb=lamb, ndata=ndata, splitleaf=[1],
                 prior_metric=prior_metric)
    heapq.heappush(queue, (tree0.metric, tree0))

    best_is_cart = False  # whether the best tree is the initial CART

    # Defaults for the values reported/returned at the end.  Without them the
    # function raised NameError whenever init_cart was False (clf) or no
    # better tree than the root was found (C_c, time_c).
    clf = None
    C_c = 0
    time_c = time.time() - tic

    if init_cart:  # warm start with CART
        clf = sklearn.tree.DecisionTreeClassifier(
            max_depth=None if MAXDEPTH == float('Inf') else MAXDEPTH,
            min_samples_split=max(math.ceil(lamb * 2 * len(y)), 2),
            min_samples_leaf=math.ceil(lamb * len(y)),
            max_leaf_nodes=math.floor(1 / (2 * lamb)),
            min_impurity_decrease=lamb)
        clf = clf.fit(x0, y0)

        nleaves_CART = (clf.tree_.node_count + 1) / 2
        trainaccu_CART = clf.score(x0, y0)

        R_c = 1 - trainaccu_CART + lamb * nleaves_CART
        d_c = clf

        C_c = 0
        time_c = time.time() - tic

        best_is_cart = True

    # read Tree from the preserved one, and only explore the children of the
    # preserved one
    if readTree:
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load 'tree.pkl' / 'leaf_cache.pkl' produced by a trusted run.
        with open('tree.pkl', 'rb') as f:
            d_c = pickle.load(f)
        R_c = d_c.risk

        with open('leaf_cache.pkl', 'rb') as f:
            leaf_cache = pickle.load(f)

        sorted_new_tree_rules = tuple(sorted(leaf.rules for leaf in d_c.leaves))
        tree_cache[sorted_new_tree_rules] = True

        tree_p = Tree(cache_tree=d_c, lamb=lamb, ndata=ndata,
                      splitleaf=[1] * len(d_c.leaves),
                      prior_metric=prior_metric)

        heapq.heappush(queue, (tree_p.metric, tree_p))
        print("PICKEL>>>>>>>>>>>>>", [leaf.rules for leaf in d_c.leaves])

        C_c = 0
        time_c = time.time() - tic

        # The incumbent is now the loaded tree, not the CART classifier, so
        # the final report must not treat d_c as a scikit-learn model.
        best_is_cart = False

    if R_c0 < R_c:
        R_c = R_c0

    leaf_cache[()] = root_leaf

    COUNT = 0  # count the total number of trees in the queue
    COUNT_POP = 0
    COUNT_UNIQLEAVES = 0
    COUNT_LEAFLOOKUPS = 0

    while queue and COUNT < niter and time.time() - tic < timelimit:
        metric, tree = heapq.heappop(queue)

        COUNT_POP = COUNT_POP + 1

        leaves = tree.cache_tree.leaves
        leaf_split = tree.splitleaf
        removed_leaves = list(compress(leaves, leaf_split))
        old_tree_length = len(leaf_split)
        new_tree_length = len(leaf_split) + sum(leaf_split)

        # prefix-specific upper bound on number of leaves
        if lenbound and new_tree_length >= min(
                old_tree_length + math.floor((R_c - tree.lb) / lamb),
                max_nleaves):
            continue

        n_removed_leaves = sum(leaf_split)
        n_unchanged_leaves = old_tree_length - n_removed_leaves

        # equivalent points bound combined with the lookahead bound
        lb = tree.lb
        b0 = sum([leaf.B0 for leaf in removed_leaves]) if equiv_points else 0
        lambbb = lamb if lookahead else 0
        if lb + b0 + n_removed_leaves * lambbb >= R_c:
            continue

        leaf_no_split = [not split for split in leaf_split]
        unchanged_leaves = list(compress(leaves, leaf_no_split))

        # Generate all assignments of rules to the leaves that are due to be
        # split: features not yet used by the leaf and not marked dead for it.
        rules_for_leaf = [
            set(range(1, nrule + 1)) - set(map(abs, l.rules)) -
            set([i + 1 for i in range(nrule) if l.is_feature_dead[i] == 1])
            for l in removed_leaves
        ]

        for leaf_rules in product(*rules_for_leaf):

            if time.time() - tic >= timelimit:
                break

            new_leaves = []
            # flag for jumping out of the loops (incremental support bound)
            flag_increm = False
            for rule, removed_leaf in zip(leaf_rules, removed_leaves):

                rule_index = rule - 1
                # points captured by the leaf's parent leaf
                tag = removed_leaf.points_cap

                # A negative rule number is the "feature == 0" child.
                for new_rule in (-rule, rule):
                    new_rule_label = int(new_rule > 0)
                    new_rules = tuple(sorted(removed_leaf.rules + (new_rule, )))
                    if new_rules not in leaf_cache:

                        COUNT_UNIQLEAVES = COUNT_UNIQLEAVES + 1

                        # For the negated feature the marker bit at position
                        # ndata is OR-ed back in after the bitwise NOT.
                        tag_rule = x_mpz[rule_index] if new_rule_label == 1 \
                            else ~(x_mpz[rule_index]) | mpz(pow(2, ndata))
                        new_points_cap, new_num_captured = rule_vand(
                            tag, tag_rule)

                        new_leaf = CacheLeaf(
                            ndata, new_rules, y_mpz, z_mpz, new_points_cap,
                            new_num_captured, lamb, support,
                            removed_leaf.is_feature_dead.copy())
                        leaf_cache[new_rules] = new_leaf
                        new_leaves.append(new_leaf)
                    else:

                        COUNT_LEAFLOOKUPS = COUNT_LEAFLOOKUPS + 1

                        new_leaf = leaf_cache[new_rules]
                        new_leaves.append(new_leaf)

                    # Lower bound on classification accuracy: if a child
                    # classifies too few points correctly, this feature is
                    # dead for the parent leaf and the assignment is dropped.
                    if accu_support == True and (
                            new_leaf.num_captured -
                            new_leaf.num_captured_incorrect) / ndata <= lamb:

                        removed_leaf.is_feature_dead[rule_index] = 1

                        flag_increm = True
                        break

                if flag_increm:
                    break

            if flag_increm:
                continue

            new_tree_leaves = unchanged_leaves + new_leaves

            sorted_new_tree_rules = tuple(
                sorted(leaf.rules for leaf in new_tree_leaves))

            # Skip trees already generated through another split order.
            if sorted_new_tree_rules in tree_cache:
                continue
            else:
                tree_cache[sorted_new_tree_rules] = True

            child = CacheTree(leaves=new_tree_leaves, lamb=lamb)

            R = child.risk
            if R < R_c:
                d_c = child
                R_c = R
                C_c = COUNT + 1
                time_c = time.time() - tic

                best_is_cart = False

            # generate the new splitleaf for the new tree
            sl = generate_new_splitleaf(unchanged_leaves, removed_leaves,
                                        new_leaves, lamb, R_c, incre_support)

            # A leaf cannot be split if
            # 1. the MAXDEPTH has been reached
            # 2. the leaf is dead (because of antecedent support)
            # 3. all the features that have not been used are dead
            cannot_split = [
                len(l.rules) >= MAXDEPTH or l.is_dead or all([
                    l.is_feature_dead[r - 1] for r in range(1, nrule + 1)
                    if r not in map(abs, l.rules)
                ]) for l in new_tree_leaves
            ]

            # For each copy, we don't split leaves which are not split in its
            # parent tree.  In this way, we can avoid duplications.
            can_split_leaf = [(0,)] * n_unchanged_leaves + \
                             [(0,) if cannot_split[i] else (0, 1)
                              for i in range(n_unchanged_leaves, new_tree_length)]
            # Discard the first element of leaf_splits, since we must split
            # at least one leaf
            new_leaf_splits0 = np.array(list(product(*can_split_leaf))[1:])
            len_sl = len(sl)
            if len_sl == 1:
                # Filter out those which split at least one leaf in dp (d0)
                new_leaf_splits = [
                    ls for ls in new_leaf_splits0 if np.dot(ls, sl[0]) > 0
                ]
            else:
                # Filter out those which split at least one leaf in dp and
                # split at least one leaf in d0
                new_leaf_splits = [
                    ls for ls in new_leaf_splits0
                    if all([np.dot(ls, sl[i]) > 0 for i in range(len_sl)])
                ]

            for new_leaf_split in new_leaf_splits:
                # construct the new tree
                tree_new = Tree(cache_tree=child, ndata=ndata, lamb=lamb,
                                splitleaf=new_leaf_split,
                                prior_metric=prior_metric)

                # MAX Number of leaves
                if len(new_leaf_split) + sum(new_leaf_split) > MAX_NLEAVES:
                    continue

                COUNT = COUNT + 1
                heapq.heappush(queue, (tree_new.metric, tree_new))

                if logon:
                    log(tic, lines, COUNT_POP, COUNT, queue, metric, R_c,
                        tree, tree_new, sorted_new_tree_rules)

                if COUNT % 1000000 == 0:
                    print("COUNT:", COUNT)

    totaltime = time.time() - tic

    if not best_is_cart:
        accu = 1 - (R_c - lamb * len(d_c.leaves))
        leaves_c = [leaf.rules for leaf in d_c.leaves]
        prediction_c = [leaf.prediction for leaf in d_c.leaves]
        num_captured = [leaf.num_captured for leaf in d_c.leaves]
        num_captured_incorrect = [
            leaf.num_captured_incorrect for leaf in d_c.leaves
        ]
        nleaves = len(leaves_c)
    else:
        accu = trainaccu_CART
        leaves_c = 'NA'
        prediction_c = 'NA'
        get_code(d_c, ['x' + str(i) for i in range(1, nrule + 1)], [0, 1])
        num_captured = 'NA'
        num_captured_incorrect = 'NA'
        nleaves = nleaves_CART

    if saveTree:
        with open('tree.pkl', 'wb') as f:
            pickle.dump(d_c, f)
        with open('leaf_cache.pkl', 'wb') as f:
            pickle.dump(leaf_cache, f)

    if logon:
        header = [
            'time', '#pop', '#push', 'queue_size', 'metric', 'R_c',
            'the_old_tree', 'the_old_tree_splitleaf',
            'the_old_tree_objective', 'the_old_tree_lbound', 'the_new_tree',
            'the_new_tree_splitleaf', 'the_new_tree_objective',
            'the_new_tree_lbound', 'the_new_tree_length',
            'the_new_tree_depth', 'queue'
        ]

        # str() on prior_metric: its default is None, which str.join rejects.
        fname = "_".join([
            str(nrule), str(ndata), str(prior_metric), str(lamb),
            str(MAXDEPTH), str(init_cart), ".txt"
        ])
        with open(fname, 'w') as f:
            f.write('%s\n' % ";".join(header))
            f.write('\n'.join(lines))

    print(">>> log:", logon)
    print(">>> support bound:", support)
    print(">>> accu_support:", accu_support)
    print(">>> accurate support bound:", incre_support)
    print(">>> equiv points bound:", equiv_points)
    print(">>> lookahead bound:", lookahead)
    print("prior_metric=", prior_metric)

    print("COUNT_UNIQLEAVES:", COUNT_UNIQLEAVES)
    print("COUNT_LEAFLOOKUPS:", COUNT_LEAFLOOKUPS)

    print("total time: ", totaltime)
    print("lambda: ", lamb)
    print("leaves: ", leaves_c)
    print("num_captured: ", num_captured)
    print("num_captured_incorrect: ", num_captured_incorrect)
    print("prediction: ", prediction_c)
    print("Objective: ", R_c)
    print("Accuracy: ", accu)
    print("COUNT of the best tree: ", C_c)
    print("time when the best tree is achieved: ", time_c)
    print("TOTAL COUNT: ", COUNT)

    return leaves_c, prediction_c, dic, nleaves, nrule, ndata, totaltime, \
        time_c, COUNT, C_c, accu, best_is_cart, clf
def bbound(x, y, name, lamb, prior_metric=None, w=None, theta=None,
           MAXDEPTH=float('Inf'), MAX_NLEAVES=float('Inf'),
           niter=float('Inf'), logon=False, support=True,
           incre_support=True, accu_support=True, equiv_points=True,
           lookahead=True, lenbound=True, R_c0=1, timelimit=float('Inf'),
           init_cart=True, saveTree=False, readTree=False):
    """Best-first branch-and-bound search over decision trees for objective ``name``.

    Trees are kept in a heap keyed by ``Tree.metric``.  Each queued tree
    carries a ``splitleaf`` mask marking which leaves are to be split; popping
    a tree generates every assignment of unused features to the marked leaves
    and pushes the resulting children (one copy per admissible split mask).

    Args:
        x: 2-D feature array, indexed as ``x[:, i]``.
        y: label array; positives are counted with ``np.count_nonzero``.
        name: objective identifier passed to ``Objective``, ``CacheLeaf``,
            ``CacheTree`` and the bound helpers.  ``'auc_convex'`` and
            ``'partial_auc'`` use their own pruning bound.
        lamb: per-leaf regularisation constant.
        prior_metric: forwarded to ``Tree``; also part of the log file name.
        w, theta: objective parameters forwarded to the helpers.
        MAXDEPTH: forwarded to ``cart`` and ``get_cannot_split``.
        MAX_NLEAVES: children that would exceed this leaf count are not queued.
        niter: stop once this many trees have been pushed.
        logon: when true, write a log header and call ``log`` per pushed tree.
        support: forwarded to ``CacheLeaf``.
        incre_support: forwarded to ``generate_new_splitleaf``.
        accu_support: accepted for interface compatibility; the bound it used
            to enable is disabled in this version.
        equiv_points: include the equivalent-points correction in the bound.
        lookahead: add ``lamb`` per leaf to the bound.
        lenbound: enable the bound on the number of leaves.
        R_c0: initial upper bound on the objective; used if below the
            warm-start objective.
        timelimit: wall-clock budget in seconds.
        init_cart: warm start from ``cart(...)``.
        saveTree: pickle the best tree and the leaf cache on exit.
        readTree: load a pickled tree / leaf cache and continue from it.

    Returns:
        ``(leaves_c, pred_c, dic, nleaves, m, n, totaltime, time_c, R_c,
        COUNT, C_c, accu, best_is_cart, clf)``.  ``clf`` is ``None`` when no
        warm start was requested.
    """
    x0 = copy.deepcopy(x)
    y0 = copy.deepcopy(y)

    tic = time.time()

    m = x.shape[1]  # number of features
    n = len(y)
    P = np.count_nonzero(y)
    N = n - P

    x_mpz = [rule_vectompz(x[:, i]) for i in range(m)]
    y_mpz = rule_vectompz(y)

    # order the columns by descending gini reduction
    idx, dic = gini_reduction(x_mpz, y_mpz, n, range(m))
    x = x[:, idx]
    x_mpz = [x_mpz[i] for i in idx]

    z_mpz = get_z(x, y, n, m)

    lines = []  # a list for log
    leaf_cache = {}  # cache leaves
    tree_cache = {}  # cache trees

    # initialize the queue to include just empty root
    queue = []
    root_leaf = CacheLeaf(name, n, P, N, (), x, y, y_mpz, z_mpz,
                          make_all_ones(n + 1), n, lamb, support, [0] * m, w)
    d_c = CacheTree(name, P, N, lamb=lamb, leaves=[root_leaf], w=w,
                    theta=theta)
    R_c = d_c.risk
    tree0 = Tree(cache_tree=d_c, n=n, lamb=lamb, splitleaf=[1],
                 prior_metric=prior_metric)
    heapq.heappush(queue, (tree0.metric, tree0))

    best_is_cart = False  # whether the best tree is the initial CART
    if init_cart:
        clf, nleaves_CART, trainout_CART, R_c, d_c, C_c = cart(
            x0, y0, name, n, P, N, lamb, w, theta, MAXDEPTH)
        time_c = time.time() - tic
        best_is_cart = True
        print('risk of cart:', R_c)
    else:
        C_c = 0
        clf = None
        # Elapsed time, like every other assignment of time_c (the original
        # stored the absolute epoch timestamp here).
        time_c = time.time() - tic

    if readTree:
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load 'tree.pkl' / 'leaf_cache.pkl' produced by a trusted run.
        with open('tree.pkl', 'rb') as f:
            d_c = pickle.load(f)
        R_c = d_c.risk

        with open('leaf_cache.pkl', 'rb') as f:
            leaf_cache = pickle.load(f)

        sorted_new_tree_rules = tuple(sorted(leaf.rules for leaf in d_c.leaves))
        tree_cache[sorted_new_tree_rules] = True

        tree_p = Tree(cache_tree=d_c, n=n, lamb=lamb,
                      splitleaf=[1] * len(d_c.leaves),
                      prior_metric=prior_metric)

        heapq.heappush(queue, (tree_p.metric, tree_p))

        C_c = 0
        time_c = time.time() - tic

        # The incumbent is now the loaded tree, not the CART model, so the
        # final report must not treat d_c as the warm-start classifier.
        best_is_cart = False

    if R_c0 < R_c:
        R_c = R_c0

    leaf_cache[()] = root_leaf

    COUNT = 0  # count the total number of trees in the queue
    COUNT_POP = 0  # number of trees popped from the queue
    COUNT_UNIQLEAVES = 0
    COUNT_LEAFLOOKUPS = 0

    if logon:
        header = ['time', '#pop', '#push', 'queue_size', 'metric', 'R_c',
                  'the_old_tree', 'the_old_tree_splitleaf',
                  'the_old_tree_objective', 'the_old_tree_lbound',
                  'the_new_tree', 'the_new_tree_splitleaf',
                  'the_new_tree_objective', 'the_new_tree_lbound',
                  'the_new_tree_length', 'the_new_tree_depth', 'queue']

        # str() on prior_metric: its default is None, which str.join rejects.
        fname = "_".join([name, str(m), str(n), str(prior_metric), str(lamb),
                          str(MAXDEPTH), str(init_cart), ".txt"])
        with open(fname, 'w') as f:
            f.write('%s\n' % ";".join(header))

    bound = Objective(name, P, N, lamb)

    while queue and COUNT < niter and time.time() - tic < timelimit:
        metric, tree = heapq.heappop(queue)

        COUNT_POP = COUNT_POP + 1

        leaves = tree.cache_tree.leaves
        leaf_split = tree.splitleaf
        removed_leaves = list(compress(leaves, leaf_split))
        old_tree_length = len(leaf_split)
        new_tree_length = len(leaf_split) + sum(leaf_split)

        # prefix-specific upper bound on number of leaves
        if lenbound and new_tree_length >= min(
                old_tree_length + math.floor((R_c - tree.lb) / lamb), 2**m):
            continue

        n_removed_leaves = sum(leaf_split)
        n_unchanged_leaves = old_tree_length - n_removed_leaves

        # equivalent points bound + lookahead bound
        lambbb = lamb if lookahead else 0

        if (name != 'auc_convex') and (name != 'partial_auc'):
            FPu, FNu = get_fixed_false(leaves, leaf_split)
            if equiv_points:
                delta_fp, delta_fn = equiv_lb(name, leaves, leaf_split, P, N,
                                              lamb, w)
            else:
                delta_fp = 0
                delta_fn = 0

            if (bound.loss(FPu + delta_fp, FNu + delta_fn, w) +
                    (old_tree_length + n_removed_leaves) * lambbb >= R_c):
                continue

        if name == 'auc_convex':
            if (ach_equiv_lb(leaves, leaf_split, P, N, lamb) +
                    n_removed_leaves * lambbb >= R_c):
                continue

        if (name == 'partial_auc') and (
                tree.lb + n_removed_leaves * lambbb >= R_c):
            continue

        leaf_no_split = [not split for split in leaf_split]
        unchanged_leaves = list(compress(leaves, leaf_no_split))

        # Generate all assignments of rules to the leaves that are due to be
        # split: features not yet used by the leaf and not marked dead for it.
        rules_for_leaf = [
            set(range(1, m + 1)) - set(map(abs, l.rules)) -
            set([i + 1 for i in range(m) if l.is_feature_dead[i] == 1])
            for l in removed_leaves
        ]

        for leaf_rules in product(*rules_for_leaf):

            if time.time() - tic >= timelimit:
                break

            new_leaves = []
            # Flag for jumping out of the loops (incremental support bound).
            # Nothing sets it in this version because the accuracy-support
            # check is disabled; the exits are kept for when it is restored.
            flag_increm = False
            for rule, removed_leaf in zip(leaf_rules, removed_leaves):

                rule_index = rule - 1
                # points captured by the leaf's parent leaf
                tag = removed_leaf.points_cap

                # A negative rule number is the "feature == 0" child.
                for new_rule in (-rule, rule):
                    new_rule_label = int(new_rule > 0)
                    new_rules = tuple(sorted(removed_leaf.rules + (new_rule,)))
                    if new_rules not in leaf_cache:

                        COUNT_UNIQLEAVES = COUNT_UNIQLEAVES + 1

                        # For the negated feature the marker bit at position
                        # n is OR-ed back in after the bitwise NOT.
                        tag_rule = x_mpz[rule_index] if new_rule_label == 1 \
                            else ~(x_mpz[rule_index]) | mpz(pow(2, n))
                        new_points_cap, new_num_captured = rule_vand(
                            tag, tag_rule)

                        new_leaf = CacheLeaf(
                            name, n, P, N, new_rules, x, y, y_mpz, z_mpz,
                            new_points_cap, new_num_captured, lamb, support,
                            removed_leaf.is_feature_dead.copy(), w)
                        leaf_cache[new_rules] = new_leaf
                        new_leaves.append(new_leaf)
                    else:

                        COUNT_LEAFLOOKUPS = COUNT_LEAFLOOKUPS + 1

                        new_leaf = leaf_cache[new_rules]
                        new_leaves.append(new_leaf)

                if flag_increm:
                    break

            if flag_increm:
                continue

            new_tree_leaves = unchanged_leaves + new_leaves

            sorted_new_tree_rules = tuple(
                sorted(leaf.rules for leaf in new_tree_leaves))

            # Skip trees already generated through another split order.
            if sorted_new_tree_rules in tree_cache:
                continue
            else:
                tree_cache[sorted_new_tree_rules] = True

            child = CacheTree(name, P, N, lamb, new_tree_leaves, w=w,
                              theta=theta)

            R = child.risk

            if R < R_c:
                d_c = child
                R_c = R
                C_c = COUNT + 1
                time_c = time.time() - tic

                best_is_cart = False

            # generate the new splitleaf for the new tree
            sl = generate_new_splitleaf(name, P, N, unchanged_leaves,
                                        removed_leaves, new_leaves, lamb,
                                        incre_support, w, theta)  # a_j

            cannot_split = get_cannot_split(name, P, N, lamb, m,
                                            new_tree_leaves, MAXDEPTH, w,
                                            theta)

            # For each copy, we don't split leaves which are not split in its
            # parent tree.  In this way, we can avoid duplications.
            can_split_leaf = [(0,)] * n_unchanged_leaves + \
                             [(0,) if cannot_split[i] else (0, 1)
                              for i in range(n_unchanged_leaves, new_tree_length)]
            # Discard the first element of leaf_splits, since we must split
            # at least one leaf
            new_leaf_splits0 = np.array(list(product(*can_split_leaf))[1:])

            # Keep the masks that split at least one leaf flagged by every
            # vector in sl (one vector: dp only; several: dp and d0).  The
            # original had two branches that computed exactly this.
            new_leaf_splits = [
                ls for ls in new_leaf_splits0
                if all([np.dot(ls, s) > 0 for s in sl])
            ]

            for new_leaf_split in new_leaf_splits:
                # construct the new tree
                tree_new = Tree(cache_tree=child, n=n, lamb=lamb,
                                splitleaf=new_leaf_split,
                                prior_metric=prior_metric)

                # MAX Number of leaves
                if len(new_leaf_split) + sum(new_leaf_split) > MAX_NLEAVES:
                    continue

                COUNT = COUNT + 1
                heapq.heappush(queue, (tree_new.metric, tree_new))

                if logon:
                    log(tic, lines, COUNT_POP, COUNT, queue, metric, R_c,
                        tree, tree_new, sorted_new_tree_rules, fname)

                if COUNT % 1000000 == 0:
                    print("COUNT:", COUNT)

    totaltime = time.time() - tic

    if not best_is_cart:
        accu = 1 - (R_c - lamb * len(d_c.leaves))
        leaves_c = [leaf.rules for leaf in d_c.leaves]
        pred_c = [leaf.pred for leaf in d_c.leaves]
        num_captured = [leaf.num_captured for leaf in d_c.leaves]
        nleaves = len(leaves_c)
    else:
        accu = trainout_CART
        leaves_c = 'NA'
        pred_c = 'NA'
        get_code(d_c, ['x' + str(i) for i in range(1, m + 1)], [0, 1])
        num_captured = 'NA'
        nleaves = nleaves_CART

    if saveTree:
        with open('tree.pkl', 'wb') as f:
            pickle.dump(d_c, f)
        with open('leaf_cache.pkl', 'wb') as f:
            pickle.dump(leaf_cache, f)

    print("loss function:", name)
    print("lambda: ", lamb)
    print("COUNT_UNIQLEAVES:", COUNT_UNIQLEAVES)
    print("COUNT_LEAFLOOKUPS:", COUNT_LEAFLOOKUPS)
    print("total time: ", totaltime)
    print("leaves: ", leaves_c)
    print("num_captured: ", num_captured)
    print("prediction: ", pred_c)
    print("Objective: ", R_c)
    print("Accuracy: ", accu)
    print("COUNT of the best tree: ", C_c)
    print("time when the best tree is achieved: ", time_c)
    print("TOTAL COUNT: ", COUNT)

    return leaves_c, pred_c, dic, nleaves, m, n, totaltime, time_c, R_c, \
        COUNT, C_c, accu, best_is_cart, clf
def __init__(self, name, n, P, N, rules, x, y, y_mpz, z_mpz, points_cap, num_captured, lamb, support, is_feature_dead, w=None): self.rules = rules self.points_cap = points_cap self.num_captured = num_captured self.is_feature_dead = is_feature_dead _, num_ones = rule_vand(points_cap, y_mpz) #return vand and cnt _, num_errors = rule_vand(points_cap, z_mpz) ''' print('rules:', rules) print("points_cap:", points_cap, "vec:", rule_mpztovec(points_cap)) print('_:', _, "vec:", rule_mpztovec(_)) print('num_errors',num_errors) ''' self.delta = num_errors self.p = num_ones self.n = self.num_captured - num_ones if self.num_captured > 0 : self.r = num_ones/self.num_captured else: self.r = 0 bound = Objective(name, P, N, lamb) if name != 'partial_auc': if num_errors > 0: cap = np.array(rule_mpztovec(points_cap)) cap_i = np.where(cap == 1)[0] x_cap = x[cap_i] y_cap = y[cap_i] v = rule_mpztovec(_) equiv_i = np.where(np.array(v) == 1)[0] idx = [i for i,c in enumerate(cap_i) if c in equiv_i] idx = np.array(idx) unique_rows, counts = np.unique(x_cap[idx,], axis=0, return_counts=True) ''' print('cap_i:', cap_i) print("x_cap:", x_cap) print("y_cap:", y_cap) print("v:", v) print("idx:", idx) ''' nrow = unique_rows.shape[0] self.equiv = np.zeros((3, nrow+2)) for i in range(nrow): comp = np.all(np.equal(x_cap, unique_rows[i,]), axis=1) eu = np.sum(comp) j = np.where(comp==True) n_neg = np.sum(y_cap[j]==0) n_pos = eu-n_neg self.equiv[0,i] = n_pos/eu #r = n_pos/eu self.equiv[1,i] = n_pos self.equiv[2,i] = n_neg self.equiv[0, nrow] = 1 #y_i = np.where(np.array(v)==0)[0] #equiv_not_i = [i for i,c in enumerate(cap_i) if c not in equiv_i] self.equiv[1, nrow] = sum(y_cap==1) - sum(self.equiv[1,i] for i in range(nrow)) self.equiv[2, nrow+1] = sum(y_cap==0) - sum(self.equiv[2,i] for i in range(nrow)) else: self.equiv = np.zeros((3, 2)) self.equiv[0,0] = 1 self.equiv[1,0] = self.p self.equiv[2,1] = self.n if self.num_captured: self.pred = bound.leaf_predict(self.p, self.n, w) if self.pred == 0: 
self.fp = 0 self.fn = self.p #self.delta_fp = 0 #self.delta_fn = self.delta else: self.fp = self.n self.fn = 0 #self.delta_fp = self.delta #self.delta_fn = 0 else: self.pred = 0 self.fp = 0 self.fn = self.p