def build(examples):
    """Recursively build a decision tree over ``examples``.

    Returns a Leaf holding the class-probability distribution when no
    useful predicate exists or when a split fails to separate the data;
    otherwise returns an internal Tree node with recursively built
    yes/no subtrees.
    """
    predicate = best_predicate(examples)
    # `is None` (identity), not `== None`: equality can be overridden and
    # is the non-idiomatic form (PEP 8).
    if predicate is None:
        return Leaf(probability_distribution(examples))
    yesses, nos = partition(predicate, examples)
    # A degenerate split (every example on one side) would recurse forever,
    # so stop with a leaf instead.
    if len(yesses) == len(examples):
        return Leaf(probability_distribution(yesses))
    if len(nos) == len(examples):
        return Leaf(probability_distribution(nos))
    return Tree(predicate, build(yesses), build(nos),
                weigh(yesses, nos, len(examples)))
def get_forest(self, size):
    """Build ``size`` trees, each fitted to its own bootstrap partition.

    Returns the list of Tree instances. A deep copy of ``self.data`` is
    handed to the Bootstrap sampler so the original data is not mutated.
    """
    bootstrap = Bootstrap(copy.deepcopy(self.data))
    # One partition per requested tree.
    bootstrap_sets = [bootstrap.get_partition() for _ in range(size)]
    # Original code reused the name `bootstrap` as the loop variable,
    # shadowing the sampler; a distinct name avoids that trap.
    # Element [0] of each partition is the training sample for one tree.
    return [Tree(copy.deepcopy(part[0]), True) for part in bootstrap_sets]
def fit(self, X, y, eval=False):
    """Fit the boosting ensemble.

    X: np.array [N, M] feature matrix
    y: np.array [N,] labels
    eval: when True, hold out 20% of the data and report train/val
        accuracy (plus inference time) after each boosting round.
        (Parameter name kept for backward compatibility even though it
        shadows the builtin ``eval``.)
    """
    if eval:
        X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
    else:
        X_tr, y_tr = X, y
    for estimator_idx in tqdm(range(self.n_estimators)):
        print('estimator', estimator_idx)  # fixed typo: was 'esitmator'
        # The first round also needs y to seed F with the initial model.
        if estimator_idx == 0:
            F = self.get_F(X_tr, y_tr)
        else:
            F = self.get_F(X_tr)
        residual = self.loss.get_residual(y_tr, F)
        tree = Tree(max_depth=self.max_depth,
                    min_criterion_improve=self.min_criterion_improve,
                    min_samples_leaf=self.min_samples_leaf)
        # Each tree is fitted to the pseudo-residuals of the current model.
        tree.fit(X_tr, residual)
        self.loss.update_terminal_region(tree, X_tr, y_tr, residual)
        self.trees.append(tree)
        if eval:
            # time.clock() was removed in Python 3.8; perf_counter() is
            # the documented replacement for elapsed-time measurement.
            time_start = time.perf_counter()
            y_tr_pred = self.predict(X_tr)
            y_pred = self.predict(X_val)
            time_elapsed = time.perf_counter() - time_start
            print(
                'current tr accu:{}, val accu: {},inference time consumption: {}'
                .format(accuracy_score(y_tr, y_tr_pred),
                        accuracy_score(y_val, y_pred), time_elapsed))
def test_tree():
    from collections import namedtuple
    # namedtuple keeps the toy weather rows self-describing.
    Data = namedtuple('Data',
                      ["outlook", "temperature", "humidity", "wind", "target"])
    rows = [
        ("sunny", "hot", "high", "false", "no"),
        ("sunny", "hot", "high", "true", "no"),
        ("overcast", "hot", "high", "false", "yes"),
        ("rain", "mild", "high", "false", "yes"),
        ("rain", "cool", "normal", "false", "yes"),
        ("rain", "cool", "normal", "true", "no"),
        ("overcast", "cool", "normal", "true", "yes"),
        ("sunny", "mild", "high", "false", "no"),
        ("sunny", "cool", "normal", "false", "yes"),
        ("rain", "mild", "normal", "false", "yes"),
        ("sunny", "mild", "normal", "true", "yes"),
        ("overcast", "mild", "high", "true", "yes"),
        ("overcast", "hot", "normal", "false", "yes"),
        ("rain", "mild", "high", "true", "no"),
    ]
    data = [Data(*row) for row in rows]
    tree = Tree()
    fitted = tree.fit(data)  # build the decision tree
    tree.print_tree(fitted)
def fit(self, data):
    """Fit a multi-class gradient-boosting model.

    data: pandas.DataFrame whose first column is an id and whose last
    column is 'label'; the columns in between are the features. The
    frame is mutated in place (one-hot label columns and per-round
    residual columns are added as boosting proceeds).
    """
    # Drop the leading id column and the trailing label column to get
    # the feature names.
    self.features = list(data.columns)[1:-1]
    # All distinct classes, as strings.
    self.classes = data['label'].unique().astype(str)
    # Tell the multi-class loss how many classes K there are.
    self.loss.init_classes(self.classes)
    # One-hot encode the 'label' column: one indicator column per class.
    for class_name in self.classes:
        label_name = 'label_' + class_name
        data[label_name] = data['label'].apply(
            lambda x: 1 if str(x) == class_name else 0)
        # Initialise f_0(x) for this class.
        self.f_0[class_name] = self.loss.initialize_f_0(data, class_name)
    # Boosting rounds m = 1, 2, ..., M.
    logger.handlers[0].setLevel(
        logging.INFO if self.is_log else logging.CRITICAL)
    for iter in range(1, self.n_trees + 1):
        # Keep at most one per-tree file handler attached: drop the one
        # from the previous round before adding this round's file.
        if len(logger.handlers) > 1:
            logger.removeHandler(logger.handlers[-1])
        fh = logging.FileHandler('results/NO.{}_tree.log'.format(iter),
                                 mode='w', encoding='utf-8')
        fh.setLevel(logging.DEBUG)
        logger.addHandler(fh)
        logger.info((
            '-----------------------------构建第%d颗树-----------------------------'
            % iter))
        # Residuals (negative gradients) are computed for all classes at
        # once here so the probability sum p_sum stays consistent.
        self.loss.calculate_residual(data, iter)
        self.trees[iter] = {}
        for class_name in self.classes:
            target_name = 'res_' + class_name + '_' + str(iter)
            # One tree per class per round, fitted to that class's residuals.
            self.trees[iter][class_name] = Tree(data, self.max_depth,
                                                self.min_samples_split,
                                                self.features, self.loss,
                                                target_name, logger)
            self.loss.update_f_m(data, self.trees, iter, class_name,
                                 self.learning_rate, logger)
        if self.is_plot:
            plot_multi(self.trees[iter], max_depth=self.max_depth, iter=iter)
    if self.is_plot:
        plot_all_trees(self.n_trees)
def fit(self, data):
    """Fit the GBDT model with squared loss.

    :param data: pandas.DataFrame whose first column is an id and whose
        last column is the target; the columns in between are the
        features. Residual columns are added to the frame in place.
        (The old docstring documented nonexistent ``x``/``y`` params.)
    """
    # Drop the leading id column and the trailing label column to get
    # the feature names.
    self.features = list(data.columns)[1:-1]
    # Initialise f_0(x); for squared loss this is the mean of y.
    self.f_0 = self.loss_function.initialize_f_0(data)
    logger.setLevel(logging.INFO if self.is_log else logging.CRITICAL)
    # Boosting rounds m = 1, 2, ..., M.  Renamed the loop variable from
    # `iter`, which shadowed the builtin.
    for m in range(1, self.n_trees + 1):
        logger.info((
            '-----------------------------构建第%d颗树-----------------------------'
            % m))
        # Negative gradient; for squared error this is just the residual.
        self.loss_function.calculate_residual(data, m)
        self.trees[m] = Tree(data, self.max_depth, self.features, m, logger)
        self.loss_function.update_f_m(data, self.trees, m,
                                      self.learning_rate, logger)
def fit(self, data): """ :param data: pandas.DataFrame, the features lstm-ptb-data of train training """ # 掐头去尾, 删除id和label,得到特征名称 self.features = list(data.columns)[1:-1] # 初始化 f_0(x) # 对于平方损失来说,初始化 f_0(x) 就是 y 的均值 self.f_0 = self.loss.initialize_f_0(data) # 对 m = 1, 2, ..., M logger.handlers[0].setLevel( logging.INFO if self.is_log else logging.CRITICAL) for iter in range(1, self.n_trees + 1): if len(logger.handlers) > 1: logger.removeHandler(logger.handlers[-1]) fh = logging.FileHandler('results/NO.{}_tree.log'.format(iter), mode='w', encoding='utf-8') fh.setLevel(logging.DEBUG) logger.addHandler(fh) # 计算负梯度--对于平方误差来说就是残差 logger.info(( '-----------------------------构建第%d颗树-----------------------------' % iter)) self.loss.calculate_residual(data, iter) target_name = 'res_' + str(iter) self.trees[iter] = Tree(data, self.max_depth, self.min_samples_split, self.features, self.loss, target_name, logger) self.loss.update_f_m(data, self.trees, iter, self.learning_rate, logger) if self.is_plot: plot_tree(self.trees[iter], max_depth=self.max_depth, iter=iter) # print(self.trees) if self.is_plot: plot_all_trees(self.n_trees)