Example #1
File: osdt.py  Project: zhouyuan1119/OSDT
def gini_reduction(x_mpz, y_mpz, ndata, rule_idx, points_cap=None):
    """
    calculate the gini reduction by each feature
    return the rank of by descending
    """

    if points_cap is None:
        points_cap = make_all_ones(ndata + 1)

    ndata0 = count_ones(points_cap)
    _, ndata01 = rule_vand(y_mpz, points_cap)

    p0 = ndata01 / ndata0
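    # binary Gini impurity of the points currently captured: 2 * p0 * (1 - p0)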
    gini0 = 2 * p0 * (1 - p0)

    gr = []
    for i in rule_idx:
        xi = x_mpz[i]
        l1_cap, ndata1 = rule_vand(points_cap, ~xi | mpz(pow(2, ndata)))

        _, ndata11 = rule_vand(l1_cap, y_mpz)

        l2_cap, ndata2 = rule_vand(points_cap, xi)

        _, ndata21 = rule_vand(l2_cap, y_mpz)

        p1 = ndata11 / ndata1 if ndata1 != 0 else 0
        p2 = ndata21 / ndata2 if ndata2 != 0 else 0
        gini1 = 2 * p1 * (1 - p1)
        gini2 = 2 * p2 * (1 - p2)
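        # Gini reduction: parent impurity minus the capture-weighted impurities of the two children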
        gini_red = gini0 - ndata1 / ndata0 * gini1 - ndata2 / ndata0 * gini2
        gr.append(gini_red)

    gr = np.array(gr)
    order = list(gr.argsort()[::-1])

    odr = [rule_idx[r] for r in order]

    #print("ndata0:", ndata0)
    #print("ndata1:", ndata1)
    #print("ndata2:", ndata2)
    print("gr:", gr)
    print("order:", order)
    print("odr:", odr)
    #print("the rank of x's columns: ", rank)

    dic = dict(zip(np.array(rule_idx) + 1, odr))

    return odr, dic
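
A minimal usage sketch (not part of the original file), assuming the helper rule_vectompz from the project's rule module is importable; the toy data and import path below are illustrative assumptions:

import numpy as np
# from rule import rule_vectompz   # assumed import path for the OSDT helpers

x = np.array([[1, 0],
              [1, 1],
              [0, 1],
              [0, 0]])                 # 4 samples, 2 binary features
y = np.array([1, 1, 0, 0])

x_mpz = [rule_vectompz(x[:, i]) for i in range(x.shape[1])]
y_mpz = rule_vectompz(y)

# odr: feature indices ordered by descending Gini reduction; dic: a 1-based mapping built from them
odr, dic = gini_reduction(x_mpz, y_mpz, len(y), range(x.shape[1]))
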
Example #2
File: osdt.py  Project: zhouyuan1119/OSDT
    def __init__(self, ndata, rules, y_mpz, z_mpz, points_cap, num_captured,
                 lamb, support, is_feature_dead):
        self.rules = rules
        self.points_cap = points_cap
        self.num_captured = num_captured
        self.is_feature_dead = is_feature_dead

        # the y values of the data points captured by this leaf
        _, num_ones = rule_vand(points_cap, y_mpz)

        # B0: the per-leaf equivalent-points bound term, defined in Eq. (28) of the paper

        _, num_errors = rule_vand(points_cap, z_mpz)
        self.B0 = num_errors / ndata

        if self.num_captured:
            self.prediction = int(num_ones / self.num_captured >= 0.5)
            if self.prediction == 1:
                self.num_captured_incorrect = self.num_captured - num_ones
            else:
                self.num_captured_incorrect = num_ones
            self.p = self.num_captured_incorrect / self.num_captured
        else:
            self.prediction = 0
            self.num_captured_incorrect = 0
            self.p = 0

        self.loss = float(self.num_captured_incorrect) / ndata

        # Lower bound on leaf support
        if support:
            # self.is_dead = self.num_captured / len(y) / 2 <= lamb
            self.is_dead = self.loss <= lamb
        else:
            self.is_dead = 0
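
A worked toy example of the per-leaf quantities computed above (all numbers are made up):

# a leaf capturing 10 of ndata = 100 points, 7 of them labeled 1, with lambda = 0.05
num_captured = 10
num_ones = 7
ndata = 100
lamb = 0.05

prediction = int(num_ones / num_captured >= 0.5)                                    # 1
num_captured_incorrect = num_captured - num_ones if prediction == 1 else num_ones   # 3
loss = num_captured_incorrect / ndata                                               # 0.03
is_dead = loss <= lamb                                                              # True: flagged dead under the leaf support check
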
Example #3
File: osdt.py  Project: zhouyuan1119/OSDT
def bbound(x,
           y,
           lamb,
           prior_metric=None,
           MAXDEPTH=float('Inf'),
           MAX_NLEAVES=float('Inf'),
           niter=float('Inf'),
           logon=False,
           support=True,
           incre_support=True,
           accu_support=True,
           equiv_points=True,
           lookahead=True,
           lenbound=True,
           R_c0=1,
           timelimit=float('Inf'),
           init_cart=True,
           saveTree=False,
           readTree=False):
    """
    An implementation of Algorithm
    ## multiple copies of tree
    ## mark which leaves to be split
    """

    x0 = copy.deepcopy(x)
    y0 = copy.deepcopy(y)

    # Initialize best rule list and objective
    # d_c = None
    # R_c = 1

    tic = time.time()

    nrule = x.shape[1]
    ndata = len(y)
    max_nleaves = 2**nrule
    print("nrule:", nrule)
    print("ndata:", ndata)

    x_mpz = [rule_vectompz(x[:, i]) for i in range(nrule)]
    y_mpz = rule_vectompz(y)
    #print("x_mpz000",x_mpz)
    #print("y_mpz000", y_mpz)

    # order the columns by descending gini reduction
    idx, dic = gini_reduction(x_mpz, y_mpz, ndata, range(nrule))
    x = x[:, idx]
    x_mpz = [x_mpz[i] for i in idx]
    print("the order of x's columns: ", idx)
    #print("x_mpz111", x_mpz)
    #print("y_mpz111", y_mpz)
    """
    calculate z, which is for the equivalent points bound
    z is the vector defined in algorithm 5 of the CORELS paper
    z is a binary vector indicating the data with a minority lable in its equivalent set
    """
    z = pd.DataFrame([-1] * ndata).values
    # iterate over the samples
    for i in range(ndata):
        # z[i, 0] == -1 means sample i has not yet been assigned to an equivalent set
        if z[i, 0] == -1:
            tag1 = np.array([True] * ndata)
            for j in range(nrule):
                rule_label = x[i][j]
                # tag1 marks the samples whose features are exactly the same as sample i's
                tag1 = (x[:, j] == rule_label) * tag1

            y_l = y[tag1]
            pred = int(y_l.sum() / len(y_l) >= 0.5)
            # tag2 marks the samples in an equivalent set that carry the minority label
            tag2 = (y_l != pred)
            z[tag1, 0] = tag2

    z_mpz = rule_vectompz(z.reshape(1, -1)[0])
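    # Illustrative example: if two samples share an identical feature vector but have
    # labels {1, 0}, they form an equivalent set; the majority prediction for the set
    # is 1 (ties break toward 1 via the >= 0.5 test), so the label-0 sample gets z = 1,
    # since no tree built on these features can classify it correctly.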

    lines = []  # a list for log
    leaf_cache = {}  # cache leaves
    tree_cache = {}  # cache trees

    # initialize the queue to include just empty root
    queue = []
    root_leaf = CacheLeaf(ndata, (), y_mpz, z_mpz, make_all_ones(ndata + 1),
                          ndata, lamb, support, [0] * nrule)

    d_c = CacheTree(leaves=[root_leaf], lamb=lamb)
    R_c = d_c.risk

    tree0 = Tree(cache_tree=d_c,
                 lamb=lamb,
                 ndata=ndata,
                 splitleaf=[1],
                 prior_metric=prior_metric)

    heapq.heappush(queue, (tree0.metric, tree0))
    # heapq.heappush(queue, (2*tree0.metric - R_c, tree0))
    # queue.append(tree0)

    best_is_cart = False  # a flag for whether or not the best is the initial CART
    if init_cart:  # if warm start
        # CART
        clf = sklearn.tree.DecisionTreeClassifier(
            max_depth=None if MAXDEPTH == float('Inf') else MAXDEPTH,
            min_samples_split=max(math.ceil(lamb * 2 * len(y)), 2),
            min_samples_leaf=math.ceil(lamb * len(y)),
            max_leaf_nodes=math.floor(1 / (2 * lamb)),
            min_impurity_decrease=lamb)
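        # these hyperparameters translate the regularization lamb into CART terms:
        # a node needs at least 2 * lamb * n points to be split, each leaf at least
        # lamb * n points, the tree may have at most floor(1 / (2 * lamb)) leaves,
        # and a split must decrease impurity by at least lamb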
        clf = clf.fit(x0, y0)

        nleaves_CART = (clf.tree_.node_count + 1) / 2
        trainaccu_CART = clf.score(x0, y0)

        R_c = 1 - trainaccu_CART + lamb * nleaves_CART
        d_c = clf

        C_c = 0
        time_c = time.time() - tic

        best_is_cart = True
    else:
        # keep these defined when no CART warm start is used
        clf = None
        C_c = 0
        time_c = time.time() - tic

    # read the tree saved previously, and only explore the children of that tree
    if readTree:
        with open('tree.pkl', 'rb') as f:
            d_c = pickle.load(f)
        R_c = d_c.risk

        with open('leaf_cache.pkl', 'rb') as f:
            leaf_cache = pickle.load(f)

        sorted_new_tree_rules = tuple(sorted(leaf.rules
                                             for leaf in d_c.leaves))
        tree_cache[sorted_new_tree_rules] = True

        tree_p = Tree(cache_tree=d_c,
                      lamb=lamb,
                      ndata=ndata,
                      splitleaf=[1] * len(d_c.leaves),
                      prior_metric=prior_metric)

        heapq.heappush(queue, (tree_p.metric, tree_p))
        print("PICKEL>>>>>>>>>>>>>", [leaf.rules for leaf in d_c.leaves])
        #print("leaf_cache:", leaf_cache)

        C_c = 0
        time_c = time.time() - tic

    if R_c0 < R_c:
        R_c = R_c0

    # log(lines, lamb, tic, len(queue), tuple(), tree0, R, d_c, R_c)

    leaf_cache[()] = root_leaf

    COUNT = 0  # count the total number of trees in the queue

    COUNT_POP = 0

    COUNT_UNIQLEAVES = 0
    COUNT_LEAFLOOKUPS = 0

    while queue and COUNT < niter and time.time() - tic < timelimit:
        # tree = queue.pop(0)
        metric, tree = heapq.heappop(queue)
        '''
        if prior_metric == "bound":
            if tree.lb + lamb*len(tree.splitleaf) >= R_c:
                break
        '''

        COUNT_POP = COUNT_POP + 1

        # print([leaf.rules for leaf in tree.leaves])
        # print("curio", curio)
        leaves = tree.cache_tree.leaves

        # print("=======COUNT=======",COUNT)
        # print("d",d)
        # print("R",tree.lbound[0]+(tree.num_captured_incorrect[0])/len(y))

        leaf_split = tree.splitleaf
        removed_leaves = list(compress(leaves, leaf_split))
        old_tree_length = len(leaf_split)
        new_tree_length = len(leaf_split) + sum(leaf_split)

        # prefix-specific upper bound on number of leaves
        if lenbound and new_tree_length >= min(
                old_tree_length + math.floor(
                    (R_c - tree.lb) / lamb), max_nleaves):
            #print("toolong===COUNT:", COUNT)
            continue

        n_removed_leaves = sum(leaf_split)
        n_unchanged_leaves = old_tree_length - n_removed_leaves

        # equivalent points bound combined with the lookahead bound
        lb = tree.lb
        b0 = sum([leaf.B0 for leaf in removed_leaves]) if equiv_points else 0
        lambbb = lamb if lookahead else 0
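        # prune: lb is the tree's lower bound, b0 adds the equivalent-points error of
        # the leaves to be split, and each split creates at least one extra leaf, i.e.
        # at least lamb more penalty (lookahead); skip if this already reaches R_c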
        if lb + b0 + n_removed_leaves * lambbb >= R_c:
            continue

        leaf_no_split = [not split for split in leaf_split]
        unchanged_leaves = list(compress(leaves, leaf_no_split))

        # lb = sum(l.loss for l in unchanged_leaves)
        # b0 = sum(l.b0 for l in removed_leaves)

        # Generate all assignments of rules to the leaves that are due to be split

        rules_for_leaf = [
            set(range(1, nrule + 1)) - set(map(abs, l.rules)) -
            set([i + 1 for i in range(nrule) if l.is_feature_dead[i] == 1])
            for l in removed_leaves
        ]

        for leaf_rules in product(*rules_for_leaf):

            if time.time() - tic >= timelimit:
                break

            new_leaves = []
            flag_increm = False  # flag for jumping out of the loops (incremental support bound)
            for rule, removed_leaf in zip(leaf_rules, removed_leaves):

                rule_index = rule - 1
                tag = removed_leaf.points_cap  # points captured by the leaf being split (the new leaves' parent)

                for new_rule in (-rule, rule):
                    new_rule_label = int(new_rule > 0)
                    new_rules = tuple(sorted(removed_leaf.rules +
                                             (new_rule, )))
                    if new_rules not in leaf_cache:

                        COUNT_UNIQLEAVES = COUNT_UNIQLEAVES + 1

                        tag_rule = x_mpz[
                            rule_index] if new_rule_label == 1 else ~(
                                x_mpz[rule_index]) | mpz(pow(2, ndata))
                        #print("x_mpz",x_mpz)
                        #print("tag_rule",tag_rule)
                        new_points_cap, new_num_captured = rule_vand(
                            tag, tag_rule)
                        # print("tag:", tag)
                        # print("tag_rule:", tag_rule)
                        # print("new_points_cap:", new_points_cap)
                        # print("new_num_captured:", new_num_captured)

                        #parent_is_feature_dead =
                        new_leaf = CacheLeaf(
                            ndata, new_rules, y_mpz, z_mpz, new_points_cap,
                            new_num_captured, lamb, support,
                            removed_leaf.is_feature_dead.copy())
                        leaf_cache[new_rules] = new_leaf
                        new_leaves.append(new_leaf)
                    else:

                        COUNT_LEAFLOOKUPS = COUNT_LEAFLOOKUPS + 1

                        new_leaf = leaf_cache[new_rules]
                        new_leaves.append(new_leaf)

                    # print("new_leaf:", new_leaf.rules)
                    # print("leaf loss:", new_leaf.loss)
                    # print("new_leaf.num_captured:",new_leaf.num_captured)
                    # print("new_leaf.num_captured_incorrect",new_leaf.num_captured_incorrect)

                    # print("******* old_rules:", removed_leaf.rules)
                    # print("******* new_rules:", new_rules)

                    # Lower bound on classification accuracy
                    # if (new_leaf.num_captured) / ndata <= lamb:
                    if accu_support and (
                            new_leaf.num_captured -
                            new_leaf.num_captured_incorrect) / ndata <= lamb:

                        removed_leaf.is_feature_dead[rule_index] = 1

                        flag_increm = True
                        break

                if flag_increm:
                    break

            if flag_increm:
                continue

            new_tree_leaves = unchanged_leaves + new_leaves

            sorted_new_tree_rules = tuple(
                sorted(leaf.rules for leaf in new_tree_leaves))

            if sorted_new_tree_rules in tree_cache:
                # print("====== New Tree Duplicated!!! ======")
                # print("sorted_new_tree_rules:", sorted_new_tree_rules)
                continue
            else:
                tree_cache[sorted_new_tree_rules] = True

            child = CacheTree(leaves=new_tree_leaves, lamb=lamb)

            R = child.risk
            # print("child:", child.sorted_leaves())
            # print("R:",R)
            if R < R_c:
                d_c = child
                R_c = R
                C_c = COUNT + 1
                time_c = time.time() - tic

                best_is_cart = False

            # generate the new splitleaf for the new tree
            sl = generate_new_splitleaf(unchanged_leaves, removed_leaves,
                                        new_leaves, lamb, R_c, incre_support)
            # print("sl:", sl)

            # A leaf cannot be split if
            # 1. the MAXDEPTH has been reached
            # 2. the leaf is dead (because of antecedent support)
            # 3. all the features that have not been used are dead
            cannot_split = [
                len(l.rules) >= MAXDEPTH or l.is_dead or all([
                    l.is_feature_dead[r - 1]
                    for r in range(1, nrule + 1) if r not in map(abs, l.rules)
                ]) for l in new_tree_leaves
            ]

            # if len(new_tree_leaves)!=new_tree_length:
            #    print("len(new_tree_leaves):",len(new_tree_leaves))
            #    print("new_tree_length:", new_tree_length)

            # For each copy, we don't split leaves which are not split in its parent tree.
            # In this way, we can avoid duplications.
            can_split_leaf = [(0,)] * n_unchanged_leaves + \
                             [(0,) if cannot_split[i]
                              else (0, 1) for i in range(n_unchanged_leaves, new_tree_length)]
            # Discard the first element of leaf_splits, since we must split at least one leaf
            new_leaf_splits0 = np.array(list(product(
                *can_split_leaf))[1:])  #sorted(product(*can_split_leaf))[1:]
            len_sl = len(sl)
            if len_sl == 1:
                # Keep only the splits that split at least one leaf in dp (d0)
                new_leaf_splits = [
                    ls for ls in new_leaf_splits0 if np.dot(ls, sl[0]) > 0
                ]
                # print("n_unchanged_leaves:",n_unchanged_leaves)
                # print("cannot_split:", cannot_split)
                # print("can_split_leaf:",can_split_leaf)
                # print("new_leaf_splits:",new_leaf_splits)
            else:
                # Keep only the splits that split at least one leaf in dp and at least one leaf in d0
                new_leaf_splits = [
                    ls for ls in new_leaf_splits0
                    if all([np.dot(ls, sl[i]) > 0 for i in range(len_sl)])
                ]

            for new_leaf_split in new_leaf_splits:
                # construct the new tree
                tree_new = Tree(cache_tree=child,
                                ndata=ndata,
                                lamb=lamb,
                                splitleaf=new_leaf_split,
                                prior_metric=prior_metric)

                # MAX Number of leaves
                if len(new_leaf_split) + sum(new_leaf_split) > MAX_NLEAVES:
                    continue

                COUNT = COUNT + 1
                # heapq.heappush(queue, (2*tree_new.metric - R_c, tree_new))
                heapq.heappush(queue, (tree_new.metric, tree_new))

                if logon:
                    log(tic, lines, COUNT_POP, COUNT, queue, metric, R_c, tree,
                        tree_new, sorted_new_tree_rules)

                if COUNT % 1000000 == 0:
                    print("COUNT:", COUNT)

    totaltime = time.time() - tic

    if not best_is_cart:

        accu = 1 - (R_c - lamb * len(d_c.leaves))

        leaves_c = [leaf.rules for leaf in d_c.leaves]
        prediction_c = [leaf.prediction for leaf in d_c.leaves]

        num_captured = [leaf.num_captured for leaf in d_c.leaves]

        num_captured_incorrect = [
            leaf.num_captured_incorrect for leaf in d_c.leaves
        ]

        nleaves = len(leaves_c)
    else:
        accu = trainaccu_CART
        leaves_c = 'NA'
        prediction_c = 'NA'
        get_code(d_c, ['x' + str(i) for i in range(1, nrule + 1)], [0, 1])
        num_captured = 'NA'
        num_captured_incorrect = 'NA'
        nleaves = nleaves_CART

    if saveTree:
        with open('tree.pkl', 'wb') as f:
            pickle.dump(d_c, f)
        with open('leaf_cache.pkl', 'wb') as f:
            pickle.dump(leaf_cache, f)

    if logon:
        header = [
            'time', '#pop', '#push', 'queue_size', 'metric', 'R_c',
            'the_old_tree', 'the_old_tree_splitleaf', 'the_old_tree_objective',
            'the_old_tree_lbound', 'the_new_tree', 'the_new_tree_splitleaf',
            'the_new_tree_objective', 'the_new_tree_lbound',
            'the_new_tree_length', 'the_new_tree_depth', 'queue'
        ]

        fname = "_".join([
            str(nrule),
            str(ndata), prior_metric,
            str(lamb),
            str(MAXDEPTH),
            str(init_cart), ".txt"
        ])
        with open(fname, 'w') as f:
            f.write('%s\n' % ";".join(header))
            f.write('\n'.join(lines))

    print(">>> log:", logon)
    print(">>> support bound:", support)
    print(">>> accu_support:", accu_support)
    print(">>> accurate support bound:", incre_support)
    print(">>> equiv points bound:", equiv_points)
    print(">>> lookahead bound:", lookahead)
    print("prior_metric=", prior_metric)

    print("COUNT_UNIQLEAVES:", COUNT_UNIQLEAVES)
    print("COUNT_LEAFLOOKUPS:", COUNT_LEAFLOOKUPS)

    print("total time: ", totaltime)
    print("lambda: ", lamb)
    print("leaves: ", leaves_c)
    print("num_captured: ", num_captured)
    print("num_captured_incorrect: ", num_captured_incorrect)
    # print("lbound: ", d_c.cache_tree.lbound)
    # print("d_c.num_captured: ", [leaf.num_captured for leaf in d_c.cache_tree.leaves])
    print("prediction: ", prediction_c)
    print("Objective: ", R_c)
    print("Accuracy: ", accu)
    print("COUNT of the best tree: ", C_c)
    print("time when the best tree is achieved: ", time_c)
    print("TOTAL COUNT: ", COUNT)

    return leaves_c, prediction_c, dic, nleaves, nrule, ndata, totaltime, time_c, COUNT, C_c, accu, best_is_cart, clf
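
A minimal call sketch for this version of bbound (illustrative only; the binary toy data and lambda are made up):

import numpy as np

x = np.array([[1, 0, 1],
              [1, 1, 0],
              [0, 1, 1],
              [0, 0, 0]])        # binary feature matrix
y = np.array([1, 1, 0, 0])

(leaves_c, prediction_c, dic, nleaves, nrule, ndata, totaltime,
 time_c, COUNT, C_c, accu, best_is_cart, clf) = bbound(x, y, lamb=0.05)
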
Example #4
def bbound(x, y, name, lamb, prior_metric=None, w=None, theta=None, MAXDEPTH=float('Inf'), 
           MAX_NLEAVES=float('Inf'), niter=float('Inf'), logon=False,
           support=True, incre_support=True, accu_support=True, equiv_points=True,
           lookahead=True, lenbound=True, R_c0 = 1, timelimit=float('Inf'), init_cart = True,
           saveTree = False, readTree = False):

    x0 = copy.deepcopy(x)
    y0 = copy.deepcopy(y)

    tic = time.time()

    m = x.shape[1] # number of features
    n = len(y)
    P = np.count_nonzero(y)
    N = n-P

    x_mpz = [rule_vectompz(x[:, i]) for i in range(m)]
    y_mpz = rule_vectompz(y)

    # order the columns by descending gini reduction
    idx, dic = gini_reduction(x_mpz, y_mpz, n, range(m))
    #idx, dic = get_variable_importance(x, y)
    
    x = x[:, idx]
    x_mpz = [x_mpz[i] for i in idx]
    
    z_mpz = get_z(x,y,n,m)


    lines = []  # a list for log
    leaf_cache = {}  # cache leaves
    tree_cache = {}  # cache trees

    # initialize the queue to include just empty root
    queue = []
    root_leaf = CacheLeaf(name, n, P, N, (), x, y, y_mpz, z_mpz, make_all_ones(n + 1), 
                          n, lamb, support, [0] * m, w)
    d_c = CacheTree(name, P, N, lamb=lamb, leaves=[root_leaf], w=w, theta=theta)
    R_c = d_c.risk
    tree0 = Tree(cache_tree=d_c, n=n, lamb=lamb,splitleaf=[1], prior_metric=prior_metric)
    heapq.heappush(queue, (tree0.metric, tree0))
    
    best_is_cart = False  # a flag for whether or not the best is the initial CART
    if init_cart: 
        clf, nleaves_CART, trainout_CART, R_c, d_c, C_c = cart(x0, y0, name, n, P, N, lamb, w, theta, MAXDEPTH)
        time_c = time.time() - tic
        best_is_cart = True
        print('risk of cart:', R_c)
    else:
        C_c = 0
        clf = None
        time_c = time.time() - tic
        
    if readTree:
        with open('tree.pkl', 'rb') as f:
            d_c = pickle.load(f)
        R_c = d_c.risk

        with open('leaf_cache.pkl', 'rb') as f:
            leaf_cache = pickle.load(f)

        sorted_new_tree_rules = tuple(sorted(leaf.rules for leaf in d_c.leaves))
        tree_cache[sorted_new_tree_rules] = True

        tree_p = Tree(cache_tree=d_c, n=n, lamb=lamb, 
                      splitleaf=[1]*len(d_c.leaves), prior_metric=prior_metric)

        heapq.heappush(queue, (tree_p.metric, tree_p))
        '''
        print("PICKEL>>>>>>>>>>>>>", [leaf.rules for leaf in d_c.leaves])
        print('R_c:', R_c)
        print('lower_bound:', tree_p.lb)
        print('lookahead:',tree_p.lb+lamb*sum(tree_p.splitleaf))
        '''
        #print("leaf_cache:", leaf_cache)

        C_c = 0
        time_c = time.time() - tic
        
    if R_c0 < R_c:
        R_c = R_c0

    
    leaf_cache[()] = root_leaf

    COUNT = 0  # count the total number of trees in the queue
    COUNT_POP = 0  # number of trees popped from the queue (i.e., trees examined)
    COUNT_UNIQLEAVES = 0
    COUNT_LEAFLOOKUPS = 0
    
    if logon:
        header = ['time', '#pop', '#push', 'queue_size', 'metric', 'R_c',
                  'the_old_tree', 'the_old_tree_splitleaf', 'the_old_tree_objective', 'the_old_tree_lbound',
                  'the_new_tree', 'the_new_tree_splitleaf',
                  'the_new_tree_objective', 'the_new_tree_lbound', 'the_new_tree_length', 'the_new_tree_depth', 'queue']

        fname = "_".join([name, str(m), str(n), prior_metric,
                          str(lamb), str(MAXDEPTH), str(init_cart), ".txt"])
        with open(fname, 'w') as f:
            f.write('%s\n' % ";".join(header))
    
    bound = Objective(name, P, N, lamb)
    
    #len_queue=[]
    #time_queue=[]
    #count_tree = []
    #time_realize_best_tree=[time_c]
    #R_best_tree=[R_c]
    #best_tree = [d_c]

    while queue and COUNT < niter and time.time() - tic < timelimit:
        '''
        print(len(queue))
        for metric, t in queue:
            print(metric, [l.rules for l in t.cache_tree.leaves], t.splitleaf)
        '''
        metric, tree = heapq.heappop(queue)
        

        COUNT_POP = COUNT_POP + 1
        #count_tree.append(COUNT_POP)
        
        leaves = tree.cache_tree.leaves
        leaf_split = tree.splitleaf       
        removed_leaves = list(compress(leaves, leaf_split))
        old_tree_length = len(leaf_split)
        new_tree_length = len(leaf_split) + sum(leaf_split)
        
        # prefix-specific upper bound on number of leaves
        if lenbound and new_tree_length >= min(old_tree_length + math.floor((R_c - tree.lb) / lamb),
                                               2**m):
            continue

        n_removed_leaves = sum(leaf_split)
        n_unchanged_leaves = old_tree_length - n_removed_leaves
        
        #print("num in queue:", len(queue))
        #print(time.time()-tic)
        #len_queue.append(len(queue))
        #time_queue.append(time.time()-tic)
        
        
        # equivalent points bound combined with the lookahead bound
        lambbb = lamb if lookahead else 0
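        # FPu/FNu: false positives/negatives already fixed given the split mask;
        # delta_fp/delta_fn: equivalent-points lower bounds on the errors of the
        # leaves to be split; the check below prunes the tree if even this optimistic
        # loss, plus lamb for each leaf of the would-be child tree, reaches R_c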
        
        if (name != 'auc_convex') and (name != 'partial_auc'):
            #for i in leaves:
                #print(rule_mpztovec(i.points_cap))
                #print('pred:',i.pred)
                #print('fp:',i.fp)
                #print('fn:',i.fn)
            
            
            FPu, FNu = get_fixed_false(leaves, leaf_split)
            if equiv_points:
                delta_fp, delta_fn = equiv_lb(name, leaves, leaf_split, P, N, lamb, w) 
                #print('delta_fp:', delta_fp)
                #print('delta_fn:', delta_fn)
            else:
                delta_fp=0
                delta_fn=0
            
            if (bound.loss(FPu+delta_fp, FNu+delta_fn, w)+ (old_tree_length+n_removed_leaves) * lambbb >= R_c):
                continue
        #    delta_fp = sum([leaf.delta_fp for leaf in removed_leaves]) if equiv_points else 0
        #    delta_fn = sum([leaf.delta_fn for leaf in removed_leaves]) if equiv_points else 0
            

        
        #if (name != "auc_convex") & (name != 'partial_auc'):
        #    delta_fp = sum([leaf.delta_fp for leaf in removed_leaves]) if equiv_points else 0
        #    delta_fn = sum([leaf.delta_fn for leaf in removed_leaves]) if equiv_points else 0
        #    FPu, FNu = get_fixed_false(leaves, leaf_split)
        
            #print("leaf:", [l.rules for l in leaves])
            #print("leaf fp:", [l.p for l in leaves])
            #print("leaf fn:", [l.n for l in leaves])
            #print("leaf delta fp:", [l.delta_fp for l in leaves])
            #print("leaf delta fn:", [l.delta_fn for l in leaves])
            #print((delta_fp+delta_fn)/(P+N))
            #print((FPu+FNu)/(P+N))
            #print(bound.loss(FPu+delta_fp, FNu+delta_fn, w))
            #print(n_removed_leaves * lambbb)
            #print("R_c:", R_c)
            #print(bound.loss(FPu+delta_fp, FNu+delta_fn, w) + (old_tree_length+n_removed_leaves) * lambbb, R_c)
            #print(bound.loss(FPu+delta_fp, FPu+delta_fn, w)+ n_removed_leaves * lambbb >= R_c)
        
        
        '''
        if (name != 'auc_convex') and (name != 'partial_auc'):
            #skip.append(bound.loss(FPu+delta_fp, FNu+delta_fn, w)+ (old_tree_length+n_removed_leaves) * lambbb >= R_c)
            print(bound.loss(FPu+delta_fp, FNu+delta_fn, w)+ (old_tree_length+n_removed_leaves) * lambbb >= R_c)
        if (name == 'auc_convex' or name == 'partial_auc'):
            #skip.append(tree.lb + n_removed_leaves * lambbb>= R_c)
            print(tree.lb + n_removed_leaves * lambbb>= R_c)
        '''
        
        #if (name != 'auc_convex') and (name != 'partial_auc') and \
        #(bound.loss(FPu+delta_fp, FNu+delta_fn, w)+ (old_tree_length+n_removed_leaves) * lambbb >= R_c):
        #    continue
        
        if (name == 'auc_convex'): 
            if (ach_equiv_lb(leaves, leaf_split, P, N, lamb) + n_removed_leaves*lambbb >= R_c):
                continue

        
        if (name == 'partial_auc') and (tree.lb + n_removed_leaves * lambbb>= R_c):
            continue

        leaf_no_split = [not split for split in leaf_split]
        unchanged_leaves = list(compress(leaves, leaf_no_split))

        # Generate all assignments of rules to the leaves that are due to be split
        rules_for_leaf = [set(range(1, m + 1)) - set(map(abs, l.rules)) -
                          set([i+1 for i in range(m) if l.is_feature_dead[i] == 1]) for l in removed_leaves]

        for leaf_rules in product(*rules_for_leaf):

            if time.time() - tic >= timelimit:
                break

            new_leaves = []
            flag_increm = False  # flag for jumping out of the loops (incremental support bound)
            for rule, removed_leaf in zip(leaf_rules, removed_leaves):

                rule_index = rule - 1
                tag = removed_leaf.points_cap  # points captured by the leaf being split (the new leaves' parent)

                for new_rule in (-rule, rule):
                    new_rule_label = int(new_rule > 0)
                    new_rules = tuple(
                        sorted(removed_leaf.rules + (new_rule,)))
                    if new_rules not in leaf_cache:

                        COUNT_UNIQLEAVES = COUNT_UNIQLEAVES+1

                        tag_rule = x_mpz[rule_index] if new_rule_label == 1 else ~(x_mpz[rule_index]) | mpz(pow(2, n))
                        new_points_cap, new_num_captured = rule_vand(tag, tag_rule)

                        #parent_is_feature_dead =
                        new_leaf = CacheLeaf(name, n, P, N, new_rules, x, y, y_mpz, z_mpz, new_points_cap, new_num_captured,
                                             lamb, support, removed_leaf.is_feature_dead.copy(), w)
                        leaf_cache[new_rules] = new_leaf
                        new_leaves.append(new_leaf)
                    else:

                        COUNT_LEAFLOOKUPS = COUNT_LEAFLOOKUPS+1

                        new_leaf = leaf_cache[new_rules]
                        new_leaves.append(new_leaf)

                    '''
                    # Lower bound on classification accuracy
                    # if (new_leaf.num_captured) / n <= lamb:
                    # accu_support == theorem 9 in OSDT, check if feature dead, not derived yet
                    
                    if accu_support == True and (new_leaf.num_captured - new_leaf.num_captured_incorrect) / n <= lamb:

                        removed_leaf.is_feature_dead[rule_index] = 1

                        flag_increm = True
                        break
                    '''    

                if flag_increm:
                    break

            if flag_increm:
                continue

            new_tree_leaves = unchanged_leaves + new_leaves

            sorted_new_tree_rules = tuple(sorted(leaf.rules for leaf in new_tree_leaves))

            if sorted_new_tree_rules in tree_cache:
                continue
            else:
                tree_cache[sorted_new_tree_rules] = True

            child = CacheTree(name, P, N, lamb, new_tree_leaves, w=w, theta=theta)
            
            #print([l.rules for l in child.leaves])

            R = child.risk
            
            #print("R:", R, "R_c:", R_c)
            #time_realize_best_tree.append(time.time()-tic)
            #R_best_tree.append(R)
            
            
            if R < R_c:
                d_c = child
                #best_tree.append([leaf.rules for leaf in d_c.leaves])
                #R_best_tree.append(R)
                #time_realize_best_tree.append(time.time()-tic)
                R_c = R
                C_c = COUNT + 1
                time_c = time.time() - tic
                

                best_is_cart = False

            # generate the new splitleaf for the new tree
            sl = generate_new_splitleaf(name, P, N, unchanged_leaves, removed_leaves, new_leaves,
                                        lamb, incre_support, w, theta) # a_j

            cannot_split = get_cannot_split(name, P, N, lamb, m, new_tree_leaves, 
                                            MAXDEPTH, w, theta)
                

            # For each copy, we don't split leaves which are not split in its parent tree.
            # In this way, we can avoid duplications.
            can_split_leaf = [(0,)] * n_unchanged_leaves + \
                             [(0,) if cannot_split[i]
                              else (0, 1) for i in range(n_unchanged_leaves, new_tree_length)]
            # Discard the first element of leaf_splits, since we must split at least one leaf
            new_leaf_splits0 = np.array(list(product(*can_split_leaf))[1:])#sorted(product(*can_split_leaf))[1:]
            len_sl = len(sl)
            if len_sl == 1:
                # Keep only the splits that split at least one leaf in dp (d0)
                new_leaf_splits = [ls for ls in new_leaf_splits0
                                   if np.dot(ls, sl[0]) > 0]
            else:
                # Keep only the splits that split at least one leaf in dp and at least one leaf in d0
                new_leaf_splits = [ls for ls in new_leaf_splits0
                                   if all([np.dot(ls, sl[i]) > 0 for i in range(len_sl)])]

            for new_leaf_split in new_leaf_splits:
                # construct the new tree
                tree_new = Tree(cache_tree=child, n=n, lamb=lamb,
                                splitleaf=new_leaf_split, prior_metric=prior_metric)
                '''
                print('tree_lb:', round(tree_new.lb, 4), 
                      'tree_risk:', round(tree.cache_tree.risk, 4))
                '''
                #print('tree_rules_x8:', [l.rules for l in tree.cache_tree.leaves])
                
                
                # MAX Number of leaves
                if len(new_leaf_split)+sum(new_leaf_split) > MAX_NLEAVES:
                    continue

                COUNT = COUNT + 1
                #print([l.rules for l in tree_new.cache_tree.leaves], tree_new.splitleaf)
                '''
                if (COUNT <= 22):
                    print([l.rules for l in tree_new.cache_tree.leaves], 
                          tree_new.splitleaf, round(tree_new.lb, 4), 
                          round(tree_new.cache_tree.risk,4), round(tree_new.metric, 4), 
                          round(metric,4), [l.rules for l in tree.cache_tree.leaves])
                
                if (COUNT ==22)|(COUNT == 21)|(COUNT==20):
                    for metric, t in queue:
                        print(metric, [l.rules for l in t.cache_tree.leaves], t.splitleaf)
                   
                if COUNT == 22:
                    print('123455667677')
                    return
                '''
                # heapq.heappush(queue, (2*tree_new.metric - R_c, tree_new))
                heapq.heappush(queue, (tree_new.metric, tree_new))
                
                if logon:
                    log(tic, lines, COUNT_POP, COUNT, queue, metric, R_c, tree, tree_new, sorted_new_tree_rules, fname)
                
                if COUNT % 1000000 == 0:
                    print("COUNT:", COUNT)
        #print('COUNT:', COUNT)

    totaltime = time.time() - tic

    if not best_is_cart:

        accu = 1-(R_c-lamb*len(d_c.leaves))

        leaves_c = [leaf.rules for leaf in d_c.leaves]
        pred_c = [leaf.pred for leaf in d_c.leaves]

        num_captured = [leaf.num_captured for leaf in d_c.leaves]

        #num_captured_incorrect = [leaf.num_captured_incorrect for leaf in d_c.leaves]

        nleaves = len(leaves_c)
    else:
        accu = trainout_CART
        leaves_c = 'NA'
        pred_c = 'NA'
        get_code(d_c, ['x'+str(i) for i in range(1, m+1)], [0, 1])
        num_captured = 'NA'
        #num_captured_incorrect = 'NA'
        nleaves = nleaves_CART
        
    if saveTree:
        with open('tree.pkl', 'wb') as f:
            pickle.dump(d_c, f)
        with open('leaf_cache.pkl', 'wb') as f:
            pickle.dump(leaf_cache, f)
        

    '''
    print(">>> log:", logon)
    print(">>> support bound:", support)
    print(">>> accu_support:", accu_support)
    print(">>> accurate support bound:", incre_support)
    print(">>> equiv points bound:", equiv_points)
    print(">>> lookahead bound:", lookahead)
    print("prior_metric=", prior_metric)
    '''
    print("loss function:", name)
    print("lambda: ", lamb)
    print("COUNT_UNIQLEAVES:", COUNT_UNIQLEAVES)
    print("COUNT_LEAFLOOKUPS:", COUNT_LEAFLOOKUPS)
    print("total time: ", totaltime)   
    print("leaves: ", leaves_c)
    print("num_captured: ", num_captured)
    print("prediction: ", pred_c)
    print("Objective: ", R_c)
    print("Accuracy: ", accu)
    print("COUNT of the best tree: ", C_c)
    print("time when the best tree is achieved: ", time_c)
    print("TOTAL COUNT: ", COUNT)

    return leaves_c, pred_c, dic, nleaves, m, n, totaltime, time_c, R_c, COUNT, C_c, \
            accu, best_is_cart, clf  # , len_queue, time_queue
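
An illustrative call of this variant (the loss name 'auc_convex' appears in the code above; the toy data, lambda, and the use of default w/theta are assumptions, not taken from the project):

import numpy as np

x = np.array([[1, 0], [1, 1], [0, 1], [0, 0]])
y = np.array([1, 1, 0, 0])

results = bbound(x, y, name='auc_convex', lamb=0.01)
leaves_c, pred_c = results[0], results[1]   # rules and predictions of the best tree's leaves
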
Example #5
    def __init__(self, name, n, P, N, rules, x, y, y_mpz, z_mpz, points_cap, 
                 num_captured, lamb, support, is_feature_dead, w=None):
        self.rules = rules
        self.points_cap = points_cap
        self.num_captured = num_captured
        self.is_feature_dead = is_feature_dead
        
        _, num_ones = rule_vand(points_cap, y_mpz)  # returns the AND vector and the count of ones
        _, num_errors = rule_vand(points_cap, z_mpz)
        '''
        print('rules:', rules)
        print("points_cap:", points_cap, "vec:", rule_mpztovec(points_cap))
        print('_:', _, "vec:", rule_mpztovec(_))
        print('num_errors',num_errors)
        '''
        self.delta = num_errors
        self.p = num_ones
        self.n = self.num_captured - num_ones
        if self.num_captured > 0:
            self.r = num_ones/self.num_captured
        else:
            self.r = 0
        bound = Objective(name, P, N, lamb)

        if name != 'partial_auc':
            if num_errors > 0:
                cap = np.array(rule_mpztovec(points_cap))
                cap_i = np.where(cap == 1)[0]
                x_cap = x[cap_i]
                y_cap = y[cap_i]
                
                v = rule_mpztovec(_)
                equiv_i = np.where(np.array(v) == 1)[0]
                idx = [i for i,c in enumerate(cap_i) if c in equiv_i]
                idx = np.array(idx)
                
                unique_rows, counts = np.unique(x_cap[idx,], axis=0, return_counts=True)
                '''
                print('cap_i:', cap_i)
                print("x_cap:", x_cap)
                print("y_cap:", y_cap)
                print("v:", v)
                print("idx:", idx)
                '''
                nrow = unique_rows.shape[0]
                self.equiv = np.zeros((3, nrow+2))
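                # equiv layout, shape (3, nrow + 2): row 0 holds the fraction of positives
                # in each equivalent-point group, rows 1 and 2 the positive/negative counts;
                # the last two columns collect the captured points outside any group
                # (all positives in column nrow, all negatives in column nrow + 1)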
    
                for i in range(nrow):
                    comp = np.all(np.equal(x_cap, unique_rows[i,]), axis=1)
                    eu = np.sum(comp)
                    j = np.where(comp)
                    n_neg = np.sum(y_cap[j]==0)
                    n_pos = eu-n_neg
                    self.equiv[0,i] = n_pos/eu    #r = n_pos/eu
                    self.equiv[1,i] = n_pos
                    self.equiv[2,i] = n_neg
            
                self.equiv[0, nrow] = 1
                #y_i = np.where(np.array(v)==0)[0]
                #equiv_not_i = [i for i,c in enumerate(cap_i) if c not in equiv_i]
                self.equiv[1, nrow] = sum(y_cap==1) - sum(self.equiv[1,i] for i in range(nrow))
                self.equiv[2, nrow+1] = sum(y_cap==0) - sum(self.equiv[2,i] for i in range(nrow))
            else:
                self.equiv = np.zeros((3, 2))
                self.equiv[0,0] = 1
                self.equiv[1,0] = self.p
                self.equiv[2,1] = self.n
    
        if self.num_captured:
            self.pred = bound.leaf_predict(self.p, self.n, w)
            if self.pred == 0:
                self.fp = 0
                self.fn = self.p
                #self.delta_fp = 0
                #self.delta_fn = self.delta
            else:
                self.fp = self.n
                self.fn = 0
                #self.delta_fp = self.delta
                #self.delta_fn = 0
        else:
            self.pred = 0
            self.fp = 0
            self.fn = self.p