Python stop_split示例

编程语言: Python

命名空间/包名称: bart_utils

方法/功能: stop_split

hotexamples.com的示例: 4

Python stop_split - 已找到4个示例。这些是从开源项目中提取的最受好评的bart_utils.stop_split现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： treemcmc.py 项目： mazphilip/pgbart

 def evaluate_new_subtree(self, data, node_id_start, param, nodes_subtree,
                          cache, settings):
     """Recompute the proposed ("_new") statistics of a subtree.

     Each training index stored at ``node_id_start`` is pushed down the tree
     using the split rules in ``self.node_info_new``.  Every node visited on
     the way accumulates the point's ``y``, ``y ** 2``, a count and the index
     itself into the ``*_new`` containers.  Afterwards the ``loglik_new`` and
     ``logprior_new`` entries of every node in ``nodes_subtree`` are refreshed.

     NOTE(review): this assumes the ``*_new`` accumulators of the subtree were
     reset by the caller before this method runs -- confirm against callers.

     Args:
         data: mapping providing ``'x_train'`` (indexed as ``[i, :]``) and
             ``'y_train'`` (indexed as ``[i]``).
         node_id_start: id of the subtree root whose points are re-routed.
         param: forwarded to ``compute_normal_normalizer``,
             ``compute_pnosplit`` and ``recompute_prob_split``.
         nodes_subtree: iterable of node ids to refresh.
         cache: forwarded to the helpers unchanged.
         settings: only ``settings.debug`` is read here; otherwise forwarded.

     Raises:
         AssertionError: when ``settings.debug == 1`` and ``check_if_zero``
             rejects the difference between the old and new log-likelihood
             of ``node_id_start``.  The original exception is re-raised as-is
             so its message and traceback are preserved.
     """
     for i in self.train_ids[node_id_start]:
         x_, y_ = data['x_train'][i, :], data['y_train'][i]
         # node_id is only ever rebound below, never mutated in place, so no
         # defensive copy of node_id_start is required.
         node_id = node_id_start
         while True:
             self.sum_y_new[node_id] += y_
             self.sum_y2_new[node_id] += y_ ** 2
             self.n_points_new[node_id] += 1
             self.train_ids_new[node_id] = np.append(
                 self.train_ids_new[node_id], i)
             if node_id in self.leaf_nodes:
                 break
             left, right = get_children_id(node_id)
             # route the point with the *proposed* split criterion
             feat_id, split, idx_split_global = self.node_info_new[node_id]
             node_id = left if x_[feat_id] <= split else right
     for node_id in nodes_subtree:
         # -inf for an empty node makes any proposal containing it rejectable
         self.loglik_new[node_id] = -np.inf
         if self.n_points_new[node_id] > 0:
             self.loglik_new[node_id], self.param_n_new[node_id] = \
                 compute_normal_normalizer(
                     self.sum_y_new[node_id], self.sum_y2_new[node_id],
                     self.n_points_new[node_id], param, cache, settings)
         if node_id in self.leaf_nodes:
             if stop_split(self.train_ids_new[node_id], settings, data,
                           cache):
                 # if leaf is empty, logprior_new[node_id] = 0.0 is incorrect; however
                 #      loglik_new[node_id] = -np.inf will reject move to a tree with empty leaves
                 self.logprior_new[node_id] = 0.0
             else:
                 # node with just 1 data point earlier could have more data points now
                 self.logprior_new[node_id] = np.log(
                     self.compute_pnosplit(node_id, param))
         else:
             # split probability might have changed if train_ids have changed
             self.recompute_prob_split(data, param, settings, cache,
                                       node_id)
     if settings.debug == 1:
         # The subtree root receives every point it held before, so its
         # log-likelihood is expected to be unchanged by the proposal.
         try:
             check_if_zero(self.loglik[node_id_start] -
                           self.loglik_new[node_id_start])
         except AssertionError:
             print('train_ids[node_id_start] = %s, train_ids_new[node_id_start] = %s' \
                     % (self.train_ids[node_id_start], self.train_ids_new[node_id_start]))
             # bare raise keeps the original assertion message and traceback
             raise

示例#2

显示文件

文件： treemcmc.py 项目： Sandy4321/pgbart

 def evaluate_new_subtree(self, data, node_id_start, param, nodes_subtree, cache, settings):
     """Re-route the points of a subtree and refresh its proposed statistics.

     For every training index held by ``node_id_start`` the point is walked
     from that node down to a leaf, following the split stored in
     ``self.node_info_new`` at each internal node.  Along the path the
     ``sum_y_new``, ``sum_y2_new``, ``n_points_new`` and ``train_ids_new``
     entries are accumulated.  Then ``loglik_new`` and ``logprior_new`` are
     recomputed for each node listed in ``nodes_subtree``.

     NOTE(review): presumably the ``*_new`` accumulators were cleared for the
     subtree by the caller beforehand -- verify against the call sites.

     Args:
         data: mapping with ``'x_train'`` (indexed ``[i, :]``) and
             ``'y_train'`` (indexed ``[i]``).
         node_id_start: id of the root of the subtree being re-evaluated.
         param: passed on to ``compute_normal_normalizer``,
             ``compute_pnosplit`` and ``recompute_prob_split``.
         nodes_subtree: iterable of node ids whose entries are refreshed.
         cache: passed on to the helpers unchanged.
         settings: ``settings.debug`` is read here; otherwise passed on.

     Raises:
         AssertionError: in debug mode, re-raised unchanged from
             ``check_if_zero`` after printing the old and new training ids
             of ``node_id_start``.
     """
     for i in self.train_ids[node_id_start]:
         x_, y_ = data['x_train'][i, :], data['y_train'][i]
         # plain rebinding is enough: node_id is never modified in place
         node_id = node_id_start
         while True:
             self.sum_y_new[node_id] += y_
             self.sum_y2_new[node_id] += y_ ** 2
             self.n_points_new[node_id] += 1
             self.train_ids_new[node_id] = np.append(self.train_ids_new[node_id], i)
             if node_id in self.leaf_nodes:
                 break
             left, right = get_children_id(node_id)
             feat_id, split, idx_split_global = self.node_info_new[node_id]   # splitting on new criteria
             if x_[feat_id] <= split:
                 node_id = left
             else:
                 node_id = right
     for node_id in nodes_subtree:
         # default for nodes that received no points
         self.loglik_new[node_id] = -np.inf
         if self.n_points_new[node_id] > 0:
             self.loglik_new[node_id], self.param_n_new[node_id] = \
                     compute_normal_normalizer(self.sum_y_new[node_id], self.sum_y2_new[node_id], \
                             self.n_points_new[node_id], param, cache, settings)
         if node_id in self.leaf_nodes:
             if stop_split(self.train_ids_new[node_id], settings, data, cache):
                 # if leaf is empty, logprior_new[node_id] = 0.0 is incorrect; however
                 #      loglik_new[node_id] = -np.inf will reject move to a tree with empty leaves
                 self.logprior_new[node_id] = 0.0
             else:
                 # node with just 1 data point earlier could have more data points now
                 self.logprior_new[node_id] = np.log(self.compute_pnosplit(node_id, param))
         else:
             # split probability might have changed if train_ids have changed
             self.recompute_prob_split(data, param, settings, cache, node_id)
     if settings.debug == 1:
         try:
             check_if_zero(self.loglik[node_id_start] - self.loglik_new[node_id_start])
         except AssertionError:
             # Parenthesised single-argument form prints the same text under
             # both the Python 2 print statement and the Python 3 function.
             print('train_ids[node_id_start] = %s, train_ids_new[node_id_start] = %s' \
                     % (self.train_ids[node_id_start], self.train_ids_new[node_id_start]))
             # re-raise the caught exception so its message/traceback survive
             raise

示例#3

显示文件

    def sample(self, data, settings, param, cache,databc,cachebc):
        """Propose one MCMC move on the tree and accept or reject it.

        The move is selected from ``settings.mcmc_type``:

        * ``'growprune'``: ``step_id`` is 0 or 1 via ``random.randint(0, 1)``.
        * ``'cgm'``: ``step_id`` comes from
          ``random_pick([0,1,2,3], [0.25, 0.25, 0.4, 0.1])`` (presumably a
          weighted choice -- confirm against ``random_pick``).

        Going by the verbose messages below, the steps are 0 = grow,
        1 = prune, 2 = change, 3 = swap.  A proposal is accepted when
        ``log_r <= log_acc`` with ``log_r = np.log(np.random.rand(1))``.

        Args:
            data: training data; ``data['x_train']`` is read directly here.
            settings: reads ``mcmc_type``, ``verbose``, ``debug``,
                ``parallelize_compute_statistics`` and ``timer``; also
                accumulates ``parallelize_compute_statistics_time``.
            param: forwarded to the prior / likelihood helpers.
            cache: forwarded to the helpers.
            databc: forwarded to ``sample_split_prior``,
                ``compute_left_right_statistics_parallel`` and
                ``evaluate_new_subtree``.
            cachebc: forwarded to ``sample_split_prior`` and
                ``evaluate_new_subtree``.

        Returns:
            Tuple ``(change, step_id)``; ``change`` is True when the proposed
            move was accepted.

        Raises:
            Exception: if ``settings.mcmc_type`` is neither ``'growprune'``
                nor ``'cgm'``.
            AssertionError: from the consistency checks run when
                ``settings.debug == 1``.

        NOTE: this method uses Python 2 ``print`` statements.
        """
        if settings.mcmc_type == 'growprune':
            step_id = random.randint(0, 1)  # only grow and prune moves permitted
        elif settings.mcmc_type == 'cgm':
            step_id = random_pick([0,1,2,3], [0.25, 0.25, 0.4, 0.1])
        else:
            raise Exception('invalid mcmc_type')
        # Defaults reported by the verbose message when no proposal is evaluated.
        log_acc = -np.inf
        log_r = 0
        # Leaves for which stop_split() is false, i.e. candidates for a grow move.
        self.grow_nodes = [n_id for n_id in self.leaf_nodes \
                    if not stop_split(self.train_ids[n_id], settings, data, cache)]
        grow_nodes = self.grow_nodes

        # ---- step 0: grow ----
        if step_id == 0:
            if not grow_nodes:
                change = False
            else:
                node_id = random.choice(grow_nodes)
                if settings.verbose >= 1:
                    print 'grow_nodes = %s, chosen node_id = %s' % (grow_nodes, node_id)

                do_not_split_node_id, feat_id, split, idx_split_global, logprior_nodeid = \
                        self.sample_split_prior(data, param, settings, cache, node_id,databc,cachebc)

                # NOTE(review): the two tests below are independent `if`s; if
                # do_not_split_node_id compares equal to neither True nor
                # False, `change` stays unbound for this step.
                if do_not_split_node_id==True:
                    change=False

                if do_not_split_node_id==False:

                    if settings.verbose >= 1:
                        print 'grow: do_not_split = %s, feat_id = %s, split = %s' \
                                % (do_not_split_node_id, feat_id, split)
                    train_ids = self.train_ids[node_id]

                    t0 = time.time()
                    if settings.parallelize_compute_statistics==0:
                        (train_ids_left, train_ids_right, cache_tmp, loglik_left, loglik_right) = \
                                compute_left_right_statistics(data, param, cache, train_ids, feat_id, split, settings)
                    else:
                        (cache_tmp, loglik_left, loglik_right) = \
                                compute_left_right_statistics_parallel(data, param, cache, train_ids, feat_id, split, settings, databc)

                        # The parallel helper does not return the id
                        # partition, so it is rebuilt here from the split.
                        cond = data['x_train'][train_ids, feat_id] <= split
                        train_ids_left = train_ids[cond]
                        train_ids_right = train_ids[~cond]

                    if settings.timer==1:
                        settings.parallelize_compute_statistics_time += time.time()-t0

                    loglik = loglik_left + loglik_right

                    # Count used by the acceptance ratio: incremented unless
                    # the sibling of node_id is currently a leaf.
                    len_both_children_terminal_new = len(self.both_children_terminal)
                    if get_sibling_id(node_id) not in self.leaf_nodes:
                        len_both_children_terminal_new += 1


                    log_acc = self.compute_log_acc_g(node_id, param, len_both_children_terminal_new, \
                                loglik, train_ids_left, train_ids_right, cache, settings, data, grow_nodes)

                    log_r = np.log(np.random.rand(1))
                    if log_r <= log_acc:

                        self.update_left_right_statistics(cache_tmp, node_id, logprior_nodeid, \
                                train_ids_left, train_ids_right, loglik_left, loglik_right, \
                                feat_id, split, idx_split_global, settings, param, data, cache)

                        # MCMC specific data structure updates
                        self.both_children_terminal.append(node_id)
                        parent = get_parent_id(node_id)
                        if (node_id != 0) and (parent in self.non_leaf_nodes):
                            self.inner_pc_pairs.append((parent, node_id))
                        sibling = get_sibling_id(node_id)
                        if sibling in self.leaf_nodes:
                            self.both_children_terminal.remove(parent)
                        change = True
                    else:
                        change = False
        # ---- step 1: prune ----
        elif step_id == 1:
            if not self.both_children_terminal:
                change = False
            else:
                node_id = random.choice(self.both_children_terminal)
                feat_id = self.node_info[node_id][0]
                if settings.verbose >= 1:
                    print 'prune: node_id = %s, feat_id = %s' % (node_id, feat_id)
                left, right = get_children_id(node_id)
                loglik = self.loglik[left] + self.loglik[right]
                len_both_children_new = len(self.both_children_terminal)
                # Grow candidates after the prune: a copy of grow_nodes with
                # node_id added and its two children removed when present.
                grow_nodes_tmp = grow_nodes[:]
                grow_nodes_tmp.append(node_id)
                try:
                    grow_nodes_tmp.remove(left)
                except ValueError:
                    pass
                try:
                    grow_nodes_tmp.remove(right)
                except ValueError:
                    pass
                # The helper returns the log inverse acceptance, hence the negation.
                log_acc = - self.compute_log_inv_acc_p(node_id, param, len_both_children_new, \
                                loglik, grow_nodes_tmp, cache, settings, data)
                log_r = np.log(np.random.rand(1))
                if log_r <= log_acc:
                    self.remove_leaf_node_statistics(left, settings)
                    self.remove_leaf_node_statistics(right, settings)
                    self.leaf_nodes.append(node_id)
                    self.non_leaf_nodes.remove(node_id)
                    self.logprior[node_id] = np.log(self.compute_pnosplit(node_id, param))
                    # OK to set logprior as above since we know that a valid split exists
                    # MCMC specific data structure updates
                    self.both_children_terminal.remove(node_id)
                    parent = get_parent_id(node_id) 
                    if (node_id != 0) and (parent in self.non_leaf_nodes):
                        self.inner_pc_pairs.remove((parent, node_id))
                    if node_id != 0:
                        sibling = get_sibling_id(node_id) 
                        if sibling in self.leaf_nodes:
                            if settings.debug == 1:
                                assert(parent not in self.both_children_terminal)
                            self.both_children_terminal.append(parent)
                    change = True
                else:
                    change = False
        # ---- step 2: change ----
        elif step_id == 2:
            if not self.non_leaf_nodes:
                change = False
            else:
                node_id = random.choice(self.non_leaf_nodes)

                do_not_split_node_id, feat_id, split, idx_split_global, logprior_nodeid = \
                        self.sample_split_prior(data, param, settings, cache, node_id,databc,cachebc)
                if settings.verbose >= 1:
                    print 'change: node_id = %s, do_not_split = %s, feat_id = %s, split = %s' \
                            % (node_id, do_not_split_node_id, feat_id, split)
                # Note: this just samples a split criterion, not guaranteed to "change" 
                #assert(not do_not_split_node_id)

                # NOTE(review): same pair of independent `if`s as in the grow
                # step; `change` is only bound when one of them matches.
                if do_not_split_node_id==True:
                    change = False

                if do_not_split_node_id==False:

                    nodes_subtree = self.get_nodes_subtree(node_id)
                    nodes_not_in_subtree = self.get_nodes_not_in_subtree(node_id)
                    if settings.debug == 1:
                        # subtree plus its complement must cover every node
                        set1 = set(list(nodes_subtree) + list(nodes_not_in_subtree))
                        set2 = set(self.leaf_nodes + self.non_leaf_nodes)
                        assert(sorted(set1) == sorted(set2))
                    self.create_new_statistics(nodes_subtree, nodes_not_in_subtree, node_id, settings)
                    self.node_info_new[node_id] = (feat_id, split, idx_split_global)
                    self.evaluate_new_subtree(data, node_id, param, nodes_subtree, cache, settings, databc, cachebc)
                    # log_acc will be be modified below
                    log_acc_tmp, loglik_diff, logprior_diff = self.compute_log_acc_cs(nodes_subtree, node_id)
                    if settings.debug == 1:
                        self.check_if_same(log_acc_tmp, loglik_diff, logprior_diff)
                    log_acc = log_acc_tmp + self.logprior[node_id] - self.logprior_new[node_id]
                    log_r = np.log(np.random.rand(1))
                    if log_r <= log_acc:
                        self.node_info[node_id] = copy(self.node_info_new[node_id])
                        self.update_subtree(node_id, nodes_subtree, settings)
                        change = True
                    else:
                        change = False
        # ---- step 3: swap ----
        elif step_id == 3:
            if not self.inner_pc_pairs:
                change = False 
            else:
                node_id, child_id = random.choice(self.inner_pc_pairs)
                nodes_subtree = self.get_nodes_subtree(node_id)
                nodes_not_in_subtree = self.get_nodes_not_in_subtree(node_id)
                if settings.debug == 1:
                    set1 = set(list(nodes_subtree) + list(nodes_not_in_subtree))
                    set2 = set(self.leaf_nodes + self.non_leaf_nodes)
                    assert(sorted(set1) == sorted(set2))
                self.create_new_statistics(nodes_subtree, nodes_not_in_subtree, node_id, settings)
                # Proposal: exchange the split rules of the parent and the child.
                self.node_info_new[node_id] = copy(self.node_info[child_id])
                self.node_info_new[child_id] = copy(self.node_info[node_id])
                if settings.verbose >= 1:
                    print 'swap: node_id = %s, child_id = %s' % (node_id, child_id)
                    print 'node_info[node_id] = %s, node_info[child_id] = %s' \
                            % (self.node_info[node_id], self.node_info[child_id])
                self.evaluate_new_subtree(data, node_id, param, nodes_subtree, cache, settings, databc, cachebc)
                log_acc, loglik_diff, logprior_diff = self.compute_log_acc_cs(nodes_subtree, node_id)
                if settings.debug == 1:
                    self.check_if_same(log_acc, loglik_diff, logprior_diff)
                log_r = np.log(np.random.rand(1))
                if log_r <= log_acc:
                    self.node_info[node_id] = copy(self.node_info_new[node_id])
                    self.node_info[child_id] = copy(self.node_info_new[child_id])
                    self.update_subtree(node_id, nodes_subtree, settings)
                    change = True
                else:
                    change = False
        if settings.verbose >= 1:
            print 'trying move: step_id = %d, move = %s, log_acc = %s, log_r = %s' \
                    % (step_id, STEP_NAMES[step_id], log_acc, log_r)
        if change:
            # Refresh the tree-level summaries after an accepted move.
            self.depth = max([get_depth(node_id) for node_id in \
                    self.leaf_nodes])
            self.loglik_current = sum([self.loglik[node_id] for node_id in \
                    self.leaf_nodes])
            if settings.verbose >= 1:
                print 'accepted move: step_id = %d, move = %s' % (step_id, STEP_NAMES[step_id])
                self.print_stuff()
        if settings.debug == 1:
            # Cross-check the incrementally maintained structures against a
            # recomputation from scratch.
            both_children_terminal, inner_pc_pairs = self.recompute_mcmc_data_structures()
            print '\nstats from recompute_mcmc_data_structures'
            print 'both_children_terminal = %s' % both_children_terminal
            print 'inner_pc_pairs = %s' % inner_pc_pairs
            assert(sorted(both_children_terminal) == sorted(self.both_children_terminal))
            assert(sorted(inner_pc_pairs) == sorted(self.inner_pc_pairs))
            grow_nodes_new = [n_id for n_id in self.leaf_nodes \
                    if not stop_split(self.train_ids[n_id], settings, data, cache)]
            if change and (step_id == 1):
                # grow_nodes_tmp is only defined on the prune path
                print 'grow_nodes_new = %s, grow_nodes_tmp = %s' % (sorted(grow_nodes_new), sorted(grow_nodes_tmp))
                assert(sorted(grow_nodes_new) == sorted(grow_nodes_tmp))
        return (change, step_id)

示例#4

显示文件

文件： treemcmc.py 项目： Sandy4321/pgbart

 def sample(self, data, settings, param, cache):
     """Propose one MCMC move on the tree and accept or reject it.

     ``settings.mcmc_type`` selects the set of moves: ``'growprune'`` draws
     ``step_id`` from {0, 1} and ``'cgm'`` from {0, 1, 2, 3}, both with
     ``random.randint``.  Steps are 0 = grow, 1 = prune, 2 = change,
     3 = swap.  A proposal is accepted when ``log_r <= log_acc`` with
     ``log_r = np.log(np.random.rand(1))``.

     Args:
         data: training data, forwarded to the helpers.
         settings: reads ``mcmc_type``, ``verbose`` and ``debug``; otherwise
             forwarded to the helpers.
         param: forwarded to the prior / likelihood helpers.
         cache: forwarded to the helpers.

     Returns:
         Tuple ``(change, step_id)``; ``change`` is True when the proposed
         move was accepted.

     Raises:
         Exception: if ``settings.mcmc_type`` is neither ``'growprune'`` nor
             ``'cgm'``.
         AssertionError: if ``sample_split_prior`` reports "do not split" in
             the grow or change step, or from the consistency checks run
             when ``settings.debug == 1``.

     NOTE: this method uses Python 2 ``print`` statements.
     """
     if settings.mcmc_type == 'growprune':
         step_id = random.randint(0, 1)  # only grow and prune moves permitted
     elif settings.mcmc_type == 'cgm':
         step_id = random.randint(0, 3)  # all 4 moves equally likely (or think of 50% grow/prune, 25% change, 25% swap)
     else:
         raise Exception('invalid mcmc_type')
     # Defaults reported by the verbose message when no proposal is evaluated.
     log_acc = -np.inf
     log_r = 0
     # Leaves for which stop_split() is false, i.e. candidates for a grow move.
     self.grow_nodes = [n_id for n_id in self.leaf_nodes \
                 if not stop_split(self.train_ids[n_id], settings, data, cache)]
     grow_nodes = self.grow_nodes
     if step_id == 0:        # GROW step
         if not grow_nodes:
             change = False
         else:
             node_id = random.choice(grow_nodes)
             if settings.verbose >= 1:
                 print 'grow_nodes = %s, chosen node_id = %s' % (grow_nodes, node_id)
             do_not_split_node_id, feat_id, split, idx_split_global, logprior_nodeid = \
                     self.sample_split_prior(data, param, settings, cache, node_id)
             # node_id was drawn from grow_nodes, so a split is expected here
             assert not do_not_split_node_id
             if settings.verbose >= 1:
                 print 'grow: do_not_split = %s, feat_id = %s, split = %s' \
                         % (do_not_split_node_id, feat_id, split)
             train_ids = self.train_ids[node_id]
             (train_ids_left, train_ids_right, cache_tmp, loglik_left, loglik_right) = \
                 compute_left_right_statistics(data, param, cache, train_ids, \
                     feat_id, split, settings)
             loglik = loglik_left + loglik_right
             # Count used by the acceptance ratio: incremented unless the
             # sibling of node_id is currently a leaf.
             len_both_children_terminal_new = len(self.both_children_terminal)
             if get_sibling_id(node_id) not in self.leaf_nodes:
                 len_both_children_terminal_new += 1
             log_acc = self.compute_log_acc_g(node_id, param, len_both_children_terminal_new, \
                         loglik, train_ids_left, train_ids_right, cache, settings, data, grow_nodes)
             log_r = np.log(np.random.rand(1))
             if log_r <= log_acc:
                 self.update_left_right_statistics(cache_tmp, node_id, logprior_nodeid, \
                         train_ids_left, train_ids_right, loglik_left, loglik_right, \
                         feat_id, split, idx_split_global, settings, param, data, cache)
                 # MCMC specific data structure updates
                 self.both_children_terminal.append(node_id)
                 parent = get_parent_id(node_id) 
                 if (node_id != 0) and (parent in self.non_leaf_nodes):
                     self.inner_pc_pairs.append((parent, node_id))
                 sibling = get_sibling_id(node_id)
                 if sibling in self.leaf_nodes:
                     self.both_children_terminal.remove(parent)
                 change = True
             else:
                 change = False
     elif step_id == 1:      # PRUNE step
         if not self.both_children_terminal:
             change = False      # nothing to prune here
         else:
             node_id = random.choice(self.both_children_terminal)
             feat_id = self.node_info[node_id][0]
             if settings.verbose >= 1:
                 print 'prune: node_id = %s, feat_id = %s' % (node_id, feat_id)
             left, right = get_children_id(node_id)
             loglik = self.loglik[left] + self.loglik[right]
             len_both_children_new = len(self.both_children_terminal)
             # Grow candidates after the prune: a copy of grow_nodes with
             # node_id added and its two children removed when present.
             grow_nodes_tmp = grow_nodes[:]
             grow_nodes_tmp.append(node_id)
             try:
                 grow_nodes_tmp.remove(left)
             except ValueError:
                 pass
             try:
                 grow_nodes_tmp.remove(right)
             except ValueError:
                 pass
             # The helper returns the log inverse acceptance, hence the negation.
             log_acc = - self.compute_log_inv_acc_p(node_id, param, len_both_children_new, \
                             loglik, grow_nodes_tmp, cache, settings, data)
             log_r = np.log(np.random.rand(1))
             if log_r <= log_acc:
                 self.remove_leaf_node_statistics(left, settings)
                 self.remove_leaf_node_statistics(right, settings)
                 self.leaf_nodes.append(node_id)
                 self.non_leaf_nodes.remove(node_id)
                 self.logprior[node_id] = np.log(self.compute_pnosplit(node_id, param))
                 # OK to set logprior as above since we know that a valid split exists
                 # MCMC specific data structure updates
                 self.both_children_terminal.remove(node_id)
                 parent = get_parent_id(node_id) 
                 if (node_id != 0) and (parent in self.non_leaf_nodes):
                     self.inner_pc_pairs.remove((parent, node_id))
                 if node_id != 0:
                     sibling = get_sibling_id(node_id) 
                     if sibling in self.leaf_nodes:
                         if settings.debug == 1:
                             assert(parent not in self.both_children_terminal)
                         self.both_children_terminal.append(parent)
                 change = True
             else:
                 change = False
     elif step_id == 2:      # CHANGE
         if not self.non_leaf_nodes:
             change = False
         else:
             node_id = random.choice(self.non_leaf_nodes)
             do_not_split_node_id, feat_id, split, idx_split_global, logprior_nodeid = \
                     self.sample_split_prior(data, param, settings, cache, node_id)
             if settings.verbose >= 1:
                 print 'change: node_id = %s, do_not_split = %s, feat_id = %s, split = %s' \
                         % (node_id, do_not_split_node_id, feat_id, split)
             # Note: this just samples a split criterion, not guaranteed to "change" 
             assert(not do_not_split_node_id)
             nodes_subtree = self.get_nodes_subtree(node_id)
             nodes_not_in_subtree = self.get_nodes_not_in_subtree(node_id)
             if settings.debug == 1:
                 # subtree plus its complement must cover every node
                 set1 = set(list(nodes_subtree) + list(nodes_not_in_subtree))
                 set2 = set(self.leaf_nodes + self.non_leaf_nodes)
                 assert(sorted(set1) == sorted(set2))
             self.create_new_statistics(nodes_subtree, nodes_not_in_subtree, node_id, settings)
             self.node_info_new[node_id] = (feat_id, split, idx_split_global)         
             self.evaluate_new_subtree(data, node_id, param, nodes_subtree, cache, settings)
             # log_acc will be be modified below
             log_acc_tmp, loglik_diff, logprior_diff = self.compute_log_acc_cs(nodes_subtree, node_id)
             if settings.debug == 1:
                 self.check_if_same(log_acc_tmp, loglik_diff, logprior_diff)
             log_acc = log_acc_tmp + self.logprior[node_id] - self.logprior_new[node_id]
             log_r = np.log(np.random.rand(1))
             if log_r <= log_acc:
                 self.node_info[node_id] = copy(self.node_info_new[node_id])
                 self.update_subtree(node_id, nodes_subtree, settings)
                 change = True
             else:
                 change = False
     elif step_id == 3:      # SWAP
         if not self.inner_pc_pairs:
             change = False 
         else:
             node_id, child_id = random.choice(self.inner_pc_pairs)
             nodes_subtree = self.get_nodes_subtree(node_id)
             nodes_not_in_subtree = self.get_nodes_not_in_subtree(node_id)
             if settings.debug == 1:
                 set1 = set(list(nodes_subtree) + list(nodes_not_in_subtree))
                 set2 = set(self.leaf_nodes + self.non_leaf_nodes)
                 assert(sorted(set1) == sorted(set2))
             self.create_new_statistics(nodes_subtree, nodes_not_in_subtree, node_id, settings)
             # Proposal: exchange the split rules of the parent and the child.
             self.node_info_new[node_id] = copy(self.node_info[child_id])
             self.node_info_new[child_id] = copy(self.node_info[node_id])
             if settings.verbose >= 1:
                 print 'swap: node_id = %s, child_id = %s' % (node_id, child_id)
                 print 'node_info[node_id] = %s, node_info[child_id] = %s' \
                         % (self.node_info[node_id], self.node_info[child_id])
             self.evaluate_new_subtree(data, node_id, param, nodes_subtree, cache, settings)
             log_acc, loglik_diff, logprior_diff = self.compute_log_acc_cs(nodes_subtree, node_id)
             if settings.debug == 1:
                 self.check_if_same(log_acc, loglik_diff, logprior_diff)
             log_r = np.log(np.random.rand(1))
             if log_r <= log_acc:
                 self.node_info[node_id] = copy(self.node_info_new[node_id])
                 self.node_info[child_id] = copy(self.node_info_new[child_id])
                 self.update_subtree(node_id, nodes_subtree, settings)
                 change = True
             else:
                 change = False
     if settings.verbose >= 1:
         print 'trying move: step_id = %d, move = %s, log_acc = %s, log_r = %s' \
                 % (step_id, STEP_NAMES[step_id], log_acc, log_r)
     if change:
         # Refresh the tree-level summaries after an accepted move.
         self.depth = max([get_depth(node_id) for node_id in \
                 self.leaf_nodes])
         self.loglik_current = sum([self.loglik[node_id] for node_id in \
                 self.leaf_nodes])
         if settings.verbose >= 1:
             print 'accepted move: step_id = %d, move = %s' % (step_id, STEP_NAMES[step_id])
             self.print_stuff()
     if settings.debug == 1:
         # Cross-check the incrementally maintained structures against a
         # recomputation from scratch.
         both_children_terminal, inner_pc_pairs = self.recompute_mcmc_data_structures()
         print '\nstats from recompute_mcmc_data_structures'
         print 'both_children_terminal = %s' % both_children_terminal
         print 'inner_pc_pairs = %s' % inner_pc_pairs
         assert(sorted(both_children_terminal) == sorted(self.both_children_terminal))
         assert(sorted(inner_pc_pairs) == sorted(self.inner_pc_pairs))
         grow_nodes_new = [n_id for n_id in self.leaf_nodes \
                 if not stop_split(self.train_ids[n_id], settings, data, cache)]
         if change and (step_id == 1):
             # grow_nodes_tmp is only defined on the prune path
             print 'grow_nodes_new = %s, grow_nodes_tmp = %s' % (sorted(grow_nodes_new), sorted(grow_nodes_tmp))
             assert(sorted(grow_nodes_new) == sorted(grow_nodes_tmp))
     return (change, step_id)