Пример #1
0
def build(examples):
    """Recursively build a decision tree over *examples*.

    Returns a Leaf when no useful predicate exists, or when the best
    predicate produces a degenerate split (all examples on one side);
    otherwise returns an inner Tree node with recursively built subtrees.
    """
    predicate = best_predicate(examples)
    # PEP 8: compare to None with `is`; `== None` can be fooled by
    # objects that override __eq__.
    if predicate is None:
        return Leaf(probability_distribution(examples))
    yesses, nos = partition(predicate, examples)
    # Degenerate split: every example fell on one side, so recursing
    # would never make progress — emit a leaf instead.
    if len(yesses) == len(examples):
        return Leaf(probability_distribution(yesses))
    if len(nos) == len(examples):
        return Leaf(probability_distribution(nos))
    return Tree(predicate, build(yesses), build(nos),
                weigh(yesses, nos, len(examples)))
Пример #2
0
    def get_forest(self, size):
        """Return a list of *size* trees, each fit on a bootstrap sample.

        A deep copy of ``self.data`` is handed to the sampler so the
        original training data is never mutated.
        """
        bootstrap = Bootstrap(copy.deepcopy(self.data))

        # Draw all bootstrap partitions first, then build one tree per
        # sample. (The loop variable no longer shadows `bootstrap` itself.)
        samples = [bootstrap.get_partition() for _ in range(size)]
        return [Tree(copy.deepcopy(sample[0]), True) for sample in samples]
Пример #3
0
    def fit(self, X, y, eval=False):
        """Fit the gradient-boosting ensemble.

        X: np.array [N,M] feature matrix
        y: np.array [N,]  labels
        eval: when True, hold out 20% of the data and report train/validation
              accuracy (plus inference time) after each boosting round.
        """
        if eval:
            X_tr, X_val, y_tr, y_val = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
        else:
            X_tr, y_tr = X, y

        for estimator_idx in tqdm(range(self.n_estimators)):
            print('estimator', estimator_idx)  # fixed typo ('esitmator')
            # The first round needs y to initialise F; later rounds derive
            # F from the trees fit so far.
            if estimator_idx == 0:
                F = self.get_F(X_tr, y_tr)
            else:
                F = self.get_F(X_tr)
            # Pseudo-residuals: negative gradient of the loss at F.
            residual = self.loss.get_residual(y_tr, F)
            tree = Tree(max_depth=self.max_depth,
                        min_criterion_improve=self.min_criterion_improve,
                        min_samples_leaf=self.min_samples_leaf)
            tree.fit(X_tr, residual)
            self.loss.update_terminal_region(tree, X_tr, y_tr, residual)
            self.trees.append(tree)

            if eval:
                # time.clock() was removed in Python 3.8; perf_counter()
                # is the portable replacement for elapsed-time measurement.
                time_start = time.perf_counter()
                y_tr_pred = self.predict(X_tr)
                y_pred = self.predict(X_val)
                time_elapsed = time.perf_counter() - time_start
                print(
                    'current tr accu:{}, val accu: {},inference time consumption: {}'
                    .format(accuracy_score(y_tr, y_tr_pred),
                            accuracy_score(y_val, y_pred), time_elapsed))
Пример #4
0
def test_tree():
    """Build a decision tree on the classic 'play tennis' dataset and print it."""
    from collections import namedtuple
    # A named tuple makes each record self-describing.
    Data = namedtuple('Data', ["outlook", "temperature", "humidity", "wind", "target"])

    rows = [
        ("sunny", "hot", "high", "false", "no"),
        ("sunny", "hot", "high", "true", "no"),
        ("overcast", "hot", "high", "false", "yes"),
        ("rain", "mild", "high", "false", "yes"),
        ("rain", "cool", "normal", "false", "yes"),
        ("rain", "cool", "normal", "true", "no"),
        ("overcast", "cool", "normal", "true", "yes"),
        ("sunny", "mild", "high", "false", "no"),
        ("sunny", "cool", "normal", "false", "yes"),
        ("rain", "mild", "normal", "false", "yes"),
        ("sunny", "mild", "normal", "true", "yes"),
        ("overcast", "mild", "high", "true", "yes"),
        ("overcast", "hot", "normal", "false", "yes"),
        ("rain", "mild", "high", "true", "no"),
    ]
    data = [Data(*row) for row in rows]

    tree = Tree()
    my_tree = tree.fit(data)  # build the tree
    tree.print_tree(my_tree)
Пример #5
0
 def fit(self, data):
     """Fit a multi-class gradient-boosting model.

     :param data: pandas.DataFrame whose first column is an id, whose
         'label' column holds the class, and whose middle columns are
         the features.
     """
     # Drop the leading id and trailing label columns to get feature names.
     self.features = list(data.columns)[1:-1]
     # All distinct class labels, as strings.
     self.classes = data['label'].unique().astype(str)
     # Tell the multi-class loss how many classes (K) there are.
     self.loss.init_classes(self.classes)
     # One-hot encode 'label': one indicator column per class.
     for class_name in self.classes:
         label_name = 'label_' + class_name
         data[label_name] = data['label'].apply(
             lambda x: 1 if str(x) == class_name else 0)
         # Initialise f_0(x) for this class.
         self.f_0[class_name] = self.loss.initialize_f_0(data, class_name)
     # For m = 1, 2, ..., M boosting rounds.
     logger.handlers[0].setLevel(
         logging.INFO if self.is_log else logging.CRITICAL)
     # `tree_idx` (renamed from `iter`) avoids shadowing the builtin.
     for tree_idx in range(1, self.n_trees + 1):
         # One log file per tree: drop the previous round's handler first.
         if len(logger.handlers) > 1:
             logger.removeHandler(logger.handlers[-1])
         fh = logging.FileHandler('results/NO.{}_tree.log'.format(tree_idx),
                                  mode='w',
                                  encoding='utf-8')
         fh.setLevel(logging.DEBUG)
         logger.addHandler(fh)
         logger.info((
             '-----------------------------构建第%d颗树-----------------------------'
             % tree_idx))
         # Residuals (negative gradients) are computed for all classes at
         # once so the probability sum p_sum stays consistent.
         self.loss.calculate_residual(data, tree_idx)
         self.trees[tree_idx] = {}
         for class_name in self.classes:
             target_name = 'res_' + class_name + '_' + str(tree_idx)
             self.trees[tree_idx][class_name] = Tree(data, self.max_depth,
                                                     self.min_samples_split,
                                                     self.features, self.loss,
                                                     target_name, logger)
             self.loss.update_f_m(data, self.trees, tree_idx, class_name,
                                  self.learning_rate, logger)
         if self.is_plot:
             plot_multi(self.trees[tree_idx],
                        max_depth=self.max_depth,
                        iter=tree_idx)
     if self.is_plot:
         plot_all_trees(self.n_trees)
Пример #6
0
 def fit(self, data):
     """Fit a regression GBDT.

     :param data: pandas.DataFrame whose first column is an id, whose last
         column is the label, and whose middle columns are the features.
     """
     # Drop the leading id and trailing label columns to get feature names.
     self.features = list(data.columns)[1: -1]
     # Initialise f_0(x); for squared loss this is the mean of y.
     self.f_0 = self.loss_function.initialize_f_0(data)
     # For m = 1, 2, ..., M boosting rounds.
     logger.setLevel(logging.INFO if self.is_log else logging.CRITICAL)
     # `tree_idx` (renamed from `iter`) avoids shadowing the builtin.
     for tree_idx in range(1, self.n_trees + 1):
         # Negative gradient; for squared error this is simply the residual.
         logger.info(('-----------------------------构建第%d颗树-----------------------------' % tree_idx))
         self.loss_function.calculate_residual(data, tree_idx)
         self.trees[tree_idx] = Tree(data, self.max_depth, self.features, tree_idx, logger)
         self.loss_function.update_f_m(data, self.trees, tree_idx, self.learning_rate, logger)
Пример #7
0
 def fit(self, data):
     """Fit a GBDT, writing one log file (and optional plot) per tree.

     :param data: pandas.DataFrame whose first column is an id, whose last
         column is the label, and whose middle columns are the features.
     """
     # Drop the leading id and trailing label columns to get feature names.
     self.features = list(data.columns)[1:-1]
     # Initialise f_0(x); for squared loss this is the mean of y.
     self.f_0 = self.loss.initialize_f_0(data)
     # For m = 1, 2, ..., M boosting rounds.
     logger.handlers[0].setLevel(
         logging.INFO if self.is_log else logging.CRITICAL)
     # `tree_idx` (renamed from `iter`) avoids shadowing the builtin.
     for tree_idx in range(1, self.n_trees + 1):
         # One log file per tree: drop the previous round's handler first.
         if len(logger.handlers) > 1:
             logger.removeHandler(logger.handlers[-1])
         fh = logging.FileHandler('results/NO.{}_tree.log'.format(tree_idx),
                                  mode='w',
                                  encoding='utf-8')
         fh.setLevel(logging.DEBUG)
         logger.addHandler(fh)
         # Negative gradient; for squared error this is simply the residual.
         logger.info((
             '-----------------------------构建第%d颗树-----------------------------'
             % tree_idx))
         self.loss.calculate_residual(data, tree_idx)
         target_name = 'res_' + str(tree_idx)
         self.trees[tree_idx] = Tree(data, self.max_depth,
                                     self.min_samples_split, self.features,
                                     self.loss, target_name, logger)
         self.loss.update_f_m(data, self.trees, tree_idx, self.learning_rate,
                              logger)
         if self.is_plot:
             plot_tree(self.trees[tree_idx],
                       max_depth=self.max_depth,
                       iter=tree_idx)
     if self.is_plot:
         plot_all_trees(self.n_trees)