def __init__(self, feature_names: list, max_node_size=10, divide_way='half',
             cost_func=square_error):
    """
    feature_names : list of string
        Feature names, aligned with the column indices of the data matrix;
        this list never changes during training.
    cost_func : cost function
    """
    self.id = next(generate_id)
    self.feature_names = feature_names
    self.cost_func = cost_func
    self.is_leaf = False
    self.feature_column = None  # column index of the feature used to split the data set
    self.split_op = None  # split operator: '==' for discrete features, '<=' for continuous ones
    self.split_value = None  # value of the feature used as the split point
    self.left_tree = None
    self.right_tree = None
    self.current_node_value = None
    self.max_node_size = max_node_size
    self.divide_way = divide_way
    logger.debug('self.id : {}'.format(self.id) +
                 '\nself.feature_names : {}'.format(self.feature_names) +
                 '\nself.max_node_size : {}'.format(self.max_node_size))
def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
    self.init_mini_batches(X, Y)
    logger.debug('X : \n{} Y : {}'.format(X, Y))
    init_estimator = self.base_estimator(
        max_node_size=self.max_tree_node_size,
        divide_way=self.base_divide_way)
    init_estimator.fit(X, Y)
    self.parameters['f'].append(init_estimator)
    self.parameters['lr'].append(1)
    for i in range(self.n_estimators):
        if self.mini_batch == 0:
            # use all samples
            cost = self.optimizer(X, Y)
        else:
            # use mini_batch samples
            X_batch, Y_batch = self.get_mini_batch()
            cost = self.optimizer(X_batch, Y_batch)
        if i % self.print_interval == 0:
            this_loss = self.get_test_cost(X_valid, Y_valid)
            train_loss = self.get_test_cost(X, Y)
            self.information['valid_loss'].append(this_loss)
            self.information['train_loss'].append(train_loss)
            logger.info(
                'train {}/{} current cost: {}, train : {} valid : {}'.format(
                    i, self.n_estimators, cost, train_loss, this_loss))
        self.update_mini_batch()
def optimize_single(self, w, X, Y):
    """
    Parameters
    ----------
    w : 2d array-like shape(n_features+1, 1)
    X : 2d array-like shape(n_samples, n_features+1)
    Y : 1d array-like shape(n_samples, 1)
    """
    learning_rate = self.learning_rate
    mini_batch = self.mini_batch
    if self.mini_batch == 0:
        # use all samples
        grads, cost = self.propagate(w, X, Y)
    else:
        # use a random mini-batch of mini_batch samples
        all_indices = np.arange(0, X.shape[0])
        np.random.shuffle(all_indices)
        batch_indices = all_indices[:mini_batch]
        grads, cost = self.propagate(w, X[batch_indices], Y[batch_indices])
    dw = grads["dw"]
    logger.debug(dw.shape)
    w = w - learning_rate * dw
    params = {"w": w}
    grads = {"dw": dw}
    return params, grads, cost
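# ---------------------------------------------------------------------------
# Usage sketch for optimize_single above (illustrative only: `model` and the
# data are assumptions, not part of the module). The "+1" in the documented
# shapes is presumably a bias term folded into the design matrix as a
# trailing column of ones, which would explain why this variant takes no
# separate b.
# ---------------------------------------------------------------------------
import numpy as np

X_raw = np.random.randn(100, 3)                # (n_samples, n_features)
X = np.hstack([X_raw, np.ones((100, 1))])      # append bias column -> n_features+1
Y = np.random.randn(100, 1)
w = np.zeros((X.shape[1], 1))                  # (n_features + 1, 1)
# params, grads, cost = model.optimize_single(w, X, Y)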
def backward(self):
    """
    Compute the gradients.
    """
    if self.optimizer == 'mini-batch':
        X, Y = self.get_mini_batch()
    else:
        X, Y = self.X_train, self.Y_train
    m = X.shape[1]
    # First, retrieve W1 and W2 from the dictionary "parameters".
    W1 = self.parameters["W1"]
    W2 = self.parameters["W2"]
    # Retrieve also A1 and A2 from dictionary "cache".
    A1 = self.cache["A1"]
    A2 = self.cache["A2"]
    # Backward propagation: calculate dW1, db1, dW2, db2.
    dZ2 = A2 - Y
    dW2 = 1 / m * np.dot(dZ2, A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))  # tanh'(z) = 1 - tanh(z)^2
    dW1 = 1 / m * np.dot(dZ1, X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    self.grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
    logger.debug('grads : \n{}'.format(self.grads))
    return
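# ---------------------------------------------------------------------------
# Standalone shape check for the backward pass above. A minimal sketch,
# assuming a tanh hidden layer (which the (1 - A1**2) term implies) and the
# column-major layout used by this class: X is (n_x, m), Y is (n_y, m). The
# output activation is omitted here; all names are illustrative.
# ---------------------------------------------------------------------------
import numpy as np

n_x, n_h, n_y, m = 4, 5, 1, 32
X = np.random.randn(n_x, m)
Y = np.random.randn(n_y, m)
W1 = np.random.randn(n_h, n_x) * 0.01
W2 = np.random.randn(n_y, n_h) * 0.01
A1 = np.tanh(W1 @ X)                 # hidden activations
A2 = W2 @ A1                         # linear output (activation omitted)

dZ2 = A2 - Y
dW2 = dZ2 @ A1.T / m
dZ1 = (W2.T @ dZ2) * (1 - A1 ** 2)   # tanh derivative
dW1 = dZ1 @ X.T / m
assert dW2.shape == W2.shape and dW1.shape == W1.shape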
def get_loss(self, X_valid, Y_valid):
    """
    Compute the Pearson correlation between the predictions and the targets
    on a validation set.
    """
    Y_valid = Y_valid.reshape(-1)
    y_pred = self.predict(X_valid)
    logger.debug('y_pred : shape{}'.format(y_pred.shape))
    logger.debug('Y_valid : shape{}'.format(Y_valid.shape))
    return pearson_correlation(y_pred, Y_valid)
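# ---------------------------------------------------------------------------
# pearson_correlation is imported from elsewhere in the repo; a minimal numpy
# sketch of what it is expected to compute (assumption: the standard sample
# Pearson r). Not part of the module.
# ---------------------------------------------------------------------------
import numpy as np

def pearson_correlation_sketch(a, b):
    a = a - a.mean()
    b = b - b.mean()
    return (a * b).sum() / np.sqrt((a * a).sum() * (b * b).sum())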
def predict(self, X_test):
    """
    Parameters
    --------------
    X_test : shape(n_samples, n_features)
    """
    A2 = self.forward(X_test=X_test.T, predict=True)
    logger.debug('A2 : \n{}'.format(A2))
    return np.round(A2).reshape(-1)
def fit(self, X, Y, watch=False):
    logger.debug('X : \n{} Y : {}'.format(X, Y))
    init_estimator = self.base_estimator(
        max_node_size=self.max_tree_node_size,
        divide_way=self.base_divide_way)
    init_estimator.fit(X, Y)
    self.parameters['f'].append(init_estimator)
    self.parameters['lr'].append(1)
    for i in range(self.n_estimators):
        cost = self.optimizer(X, Y)
        if i % self.print_interval == 0:
            logger.info('train {}/{} current cost: {}'.format(
                i, self.n_estimators, cost))
def optimize_single(self, w, b, X, Y):
    """
    Parameters
    ----------
    w : 2d array-like shape(n_features, 1)
    b : 1d array-like shape(1,)
    X : 2d array-like shape(n_features, n_samples)
    Y : 1d array-like shape(1, n_samples)
    """
    logger.debug('X : {}\nshape:{}'.format(X, X.shape))
    logger.debug('Y : {}\nshape:{}'.format(Y, Y.shape))
    learning_rate = self.learning_rate
    if self.mini_batch == 0:
        grads, cost = self.propagate(w, b, X, Y)
    else:
        X_batch, Y_batch = self.get_mini_batch()
        logger.debug('X_batch : {}\nshape:{}'.format(
            X_batch, X_batch.shape))
        logger.debug('Y_batch : {}\nshape:{}'.format(
            Y_batch, Y_batch.shape))
        grads, cost = self.propagate(w, b, X_batch, Y_batch)
    dw = grads["dw"]
    db = grads["db"]
    w = w - learning_rate * dw
    b = b - learning_rate * db
    params = {"w": w, "b": b}
    grads = {"dw": dw, "db": db}
    return params, grads, cost
def initialize_parameters(self, n_x, n_h, n_y):
    """
    Initialize the network parameters.
    """
    # Fix the seed so runs are reproducible even though the
    # initialization is random.
    np.random.seed(2)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    assert W1.shape == (n_h, n_x)
    assert b1.shape == (n_h, 1)
    assert W2.shape == (n_y, n_h)
    assert b2.shape == (n_y, 1)
    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
    logger.debug('initialize_parameters : {}'.format(parameters))
    return parameters
def propagate(self, w, X, Y):
    """
    Parameters
    ----------
    w : 2d array-like shape(n_features+1, 1)
    X : 2d array-like shape(n_samples, n_features+1)
    Y : 1d array-like shape(n_samples, 1)
    """
    m = X.shape[0]  # m = n_samples
    n_features_plus = X.shape[1]
    # Y_hat shape(n_samples, 1); (w.T * X) broadcasts w across the rows,
    # so the row sums equal X @ w.
    Y_hat = (w.T * X).sum(axis=1).reshape(m, 1)
    cost = 1 / (2 * m) * np.square(Y_hat - Y).sum()
    dw = np.zeros((n_features_plus, 1))
    # Gradient of each parameter; the loop is equivalent to
    # dw = X.T @ (Y_hat - Y) / m.
    for j in range(0, n_features_plus):
        dw[j, 0] = 1 / m * ((Y_hat - Y) * (X[:, j].reshape(-1, 1))).sum()
    cost = np.squeeze(cost)
    grads = {"dw": dw}
    logger.debug('dw :{}'.format(dw.shape))
    return grads, cost
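# ---------------------------------------------------------------------------
# Standalone sketch verifying the vectorized equivalence noted above
# (illustrative names; not part of the module): the per-feature loop and
# X.T @ (Y_hat - Y) / m produce the same gradient.
# ---------------------------------------------------------------------------
import numpy as np

m, n = 50, 4
X = np.random.randn(m, n)
w = np.random.randn(n, 1)
Y = np.random.randn(m, 1)
Y_hat = X @ w
dw_loop = np.array([[((Y_hat - Y) * X[:, [j]]).sum() / m] for j in range(n)])
dw_vec = X.T @ (Y_hat - Y) / m
assert np.allclose(dw_loop, dw_vec)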
def feat_data(self, X_train, Y_train, X_valid=None, Y_valid=None):
    """
    Feed the data in and set up the input, hidden, and output layers.

    Parameters
    ------------
    X_train : shape(n_samples, n_features)
    Y_train : shape(n_samples, )
    X_valid : shape(n_samples, n_features)
    Y_valid : shape(n_samples, )
    """
    # Internally the network uses a column-major layout: one column per sample.
    X_train = X_train.T
    Y_train = Y_train.reshape((1, -1))
    self.init_mini_batches(X_train, Y_train)
    if X_valid is not None:
        X_valid = X_valid.T
    if Y_valid is not None:
        Y_valid = Y_valid.reshape((1, -1))
    n_x = X_train.shape[0]  # size of input layer
    n_h = self.hidden_size
    n_y = Y_train.shape[0]  # size of output layer
    self.structure = (n_x, n_h, n_y)
    logger.debug('self.structure : {}'.format(self.structure))
    self.X_train = X_train
    self.Y_train = Y_train
    self.X_valid = X_valid
    self.Y_valid = Y_valid
    self.parameters = self.initialize_parameters(n_x, n_h, n_y)
    logger.debug('self.X_train : \n{}\nshape : {}'.format(
        self.X_train, self.X_train.shape))
    logger.debug('self.Y_train : \n{}\nshape : {}'.format(
        self.Y_train, self.Y_train.shape))
def precision_score(y_true, y_pred):
    """Compute the fraction of correct predictions.

    Intended for classification: despite the name, this is plain accuracy,
    i.e. the share of samples where the predicted Y equals the true Y.

    Parameters
    --------------
    y_true : 1d array-like
        Ground truth (correct) target values.
    y_pred : 1d array-like
        Estimated targets as returned by a classifier.

    Return
    -------
    accuracy_rate : float
    """
    y_true = y_true.reshape(-1).astype(int)
    y_pred = y_pred.reshape(-1).astype(int)
    assert len(y_pred) == len(y_true)
    total_num = len(y_pred)
    success_num = 0
    for i in range(0, total_num):
        if y_true[i] == y_pred[i]:
            success_num += 1
    logger.debug('y_true : {}'.format(y_true))
    logger.debug('y_pred : {}'.format(y_pred))
    result = float(success_num) / total_num
    logger.debug('result : {}'.format(result))
    return result
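# ---------------------------------------------------------------------------
# Usage sketch for precision_score above (toy data, illustrative only):
# ---------------------------------------------------------------------------
import numpy as np

y_true = np.array([1, 0, 1, 1])
y_pred = np.array([1, 0, 0, 1])
print(precision_score(y_true, y_pred))  # 0.75 -- three of four predictions match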
def fit(self, X, Y, column_flags, feature_names=None):
    """
    Parameters
    -----------
    X : 2d array-like
    Y : 1d array-like
    """
    logger.debug('X : \n{}'.format(X))
    logger.debug('Y : {}'.format(Y))
    n_samples = X.shape[0]
    n_features = X.shape[1]
    if feature_names is None:
        feature_names = [str(i) for i in range(n_features)]
    logger.debug('feature_names : {}'.format(feature_names))
    self.root_node = CartTreeClassifierNode(
        feature_names,
        column_flags,
        max_node_size=self.max_node_size,
        divide_way=self.divide_way)
    self.root_node.fit_data(X, Y, None)
def get_train_and_valid_result(self):
    """
    Returns
    -------------
    train_loss : float
    valid_loss : float

    Both values are precision_score results (higher is better); valid_loss
    is 0 when no validation set was supplied.
    """
    Y_train_pred = np.round(
        self.forward(X_test=self.X_train, predict=True)).reshape(-1)
    logger.debug('Y_train_pred : \n{}'.format(Y_train_pred))
    logger.debug('self.Y_train : \n{}'.format(self.Y_train))
    train_loss = precision_score(Y_train_pred, self.Y_train.reshape(-1))
    if self.X_valid is not None:
        Y_valid_pred = np.round(
            self.forward(X_test=self.X_valid, predict=True)).reshape(-1)
        logger.debug('Y_valid_pred : \n{}'.format(Y_valid_pred))
        valid_loss = precision_score(Y_valid_pred, self.Y_valid.reshape(-1))
    else:
        valid_loss = 0
    return train_loss, valid_loss
# Quick smoke test for the shared logger.
from pyml.logger import logger

logger.debug('test')
logger.info('t')


def functions():
    logger.error('error')


functions()
def optimizer(self, X, Y, watch=False):
    """
    Run one boosting iteration.
    """
    logger.debug('X : \n{}\nY : {}'.format(X, Y))
    cur_Y_pred = self.predict(X)
    logger.debug('cur_Y_pred : {}'.format(cur_Y_pred))
    if self.loss == 'ls':
        # Squared-error cost: half the sum of squares (the 1/2 makes the
        # gradient below come out clean).
        cost = np.square(cur_Y_pred - Y).sum() / 2
        # Residuals, i.e. the gradient of the loss w.r.t. the predictions.
        d_fx = cur_Y_pred - Y
        logger.debug('d_fx : {}'.format(d_fx))
        # Negate: the next estimator fits the *negative* gradient.
        d_fx = -d_fx
    elif self.loss == 'lad':
        cost = absolute_distance(cur_Y_pred, Y)
        d_fx = np.sign(cur_Y_pred - Y)
        d_fx = -d_fx
    elif self.loss == 'huber':
        # cost
        deviation = cur_Y_pred - Y
        logger.debug('deviation : {}'.format(deviation))
        abs_deviation = np.abs(deviation)
        logger.debug('abs_deviation : {}'.format(abs_deviation))
        small_part_index = abs_deviation <= self.delta
        big_part_index = abs_deviation > self.delta
        # Quadratic part: deviations with |d| <= delta contribute d^2 / 2.
        cost = np.square(abs_deviation[small_part_index]).sum() / 2
        logger.debug('cost : {}'.format(cost))
        # Linear part: deviations with |d| > delta contribute
        # delta * (|d| - delta / 2).
        cost += self.delta * (abs_deviation[big_part_index] -
                              self.delta / 2).sum()
        logger.debug('cost : {}'.format(cost))
        d_fx = np.zeros(Y.shape)
        d_fx[small_part_index] = deviation[small_part_index]
        logger.debug('d_fx : {}'.format(d_fx))
        d_fx[big_part_index] = self.delta * np.sign(
            deviation[big_part_index])
        logger.debug('d_fx : {}'.format(d_fx))
        d_fx = -d_fx
    else:
        raise NotImplementedError
    # Learning rate; kept at the value given at construction.
    lr = self.learning_rate
    self.information['gradient'].append(d_fx)
    # Fit a new base regressor to the negative gradient.
    new_estimator = self.base_estimator(
        max_node_size=self.max_tree_node_size, divide_way=self.divide_way)
    new_estimator.fit(X, d_fx)
    self.parameters['f'].append(new_estimator)
    self.parameters['lr'].append(lr)
    return cost
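# ---------------------------------------------------------------------------
# Standalone sketch of the three negative gradients used above, on toy data
# (illustrative only; the delta value here is an assumption):
# ---------------------------------------------------------------------------
import numpy as np

pred = np.array([2.0, 0.5, -1.0])
y = np.array([1.0, 0.5, 1.0])
delta = 1.0
residual = pred - y

neg_grad_ls = -residual                       # ls: fit the residuals
neg_grad_lad = -np.sign(residual)             # lad: fit the sign only
neg_grad_huber = -np.where(np.abs(residual) <= delta,
                           residual,
                           delta * np.sign(residual))  # huber: clipped residual
print(neg_grad_ls, neg_grad_lad, neg_grad_huber)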
def fit_data(self, sub_X, sub_Y, parent_class):
    """
    sub_X : 2d array-like shape(n_samples, n_features)
    sub_Y : 1d array-like shape(n_samples, )
    parent_class : int
        The parent node's class, recorded during recursion; it serves as a
        fallback when this node receives no samples.
    """
    logger.debug(
        'training...\ncurrent id : {}\ncurrent data size : {}'.format(
            self.id, sub_X.shape[0]))
    logger.debug('X : \n{}\nY : {}'.format(sub_X, sub_Y) + '\n' +
                 'parent_class : {}'.format(parent_class))
    # If the data set is empty, fall back to the parent's class.
    if len(sub_X) == 0:
        logger.debug('sub_X is empty ! ')
        self.set_leaf(parent_class.item())
        return
    # If all remaining labels are identical, the class is already decided.
    if len(np.unique(sub_Y)) <= 1:
        logger.debug('sub_Y is all the same ! ')
        self.set_leaf(sub_Y[0].item())
        return
    # Use the most frequent label in sub_Y as this node's class.
    self.current_node_class = np.unique(sub_Y)[np.argmax(
        np.unique(sub_Y, return_counts=True)[1])]
    logger.debug('self.current_node_class : {}'.format(
        self.current_node_class))
    if sub_X.shape[0] <= self.max_node_size:
        logger.debug('sub_X is so small. n_samples : {}'.format(
            sub_X.shape[0]))
        self.set_leaf(self.current_node_class)
        return
    # Search for the best split (best_feature_column, best_split_point);
    # the data is split into left and right branches once the search ends.
    best_gini_value = float('inf')
    best_feature_column = None
    best_split_point = None
    logger.debug('in find the best split feature...')
    for this_feature_index in range(0, len(self.feature_names)):
        n_samples = sub_X.shape[0]
        if self.divide_way == 'default':
            # Every distinct value of this feature is a candidate split
            # point; compute the weighted Gini index for each.
            this_feature_values = np.unique(sub_X[:, this_feature_index])
            for this_feature_value in this_feature_values:
                if self.column_flags[this_feature_index] == 'continuous':
                    # Continuous feature: split with '<='. The largest value
                    # would leave the right branch empty, so skip it.
                    if this_feature_value == np.amax(
                            sub_X[:, this_feature_index]):
                        continue
                    left_branch_Y = sub_Y[
                        sub_X[:, this_feature_index] <= this_feature_value]
                    right_branch_Y = sub_Y[
                        sub_X[:, this_feature_index] > this_feature_value]
                elif self.column_flags[this_feature_index] == 'discrete':
                    # Discrete feature: split with '=='.
                    left_branch_Y = sub_Y[sub_X[:, this_feature_index] ==
                                          this_feature_value]
                    right_branch_Y = sub_Y[
                        sub_X[:, this_feature_index] != this_feature_value]
                this_feature_gini_value = (
                    len(left_branch_Y) / n_samples * gini(left_branch_Y) +
                    len(right_branch_Y) / n_samples * gini(right_branch_Y))
                logger.debug(
                    'in feature({}:{}) value({}) gini_value({})\nleft_branch_Y : {}\nright_branch_Y : {}'
                    .format(this_feature_index,
                            self.feature_names[this_feature_index],
                            this_feature_value, this_feature_gini_value,
                            left_branch_Y, right_branch_Y))
                # Keep the split with the smallest Gini index seen so far.
                if this_feature_gini_value < best_gini_value:
                    best_gini_value = this_feature_gini_value
                    best_feature_column = this_feature_index
                    best_split_point = this_feature_value
        elif self.divide_way == 'half':
            # 'half': continuous features are simply split at the midpoint
            # of their range.
            if self.column_flags[this_feature_index] == 'continuous':
                this_feature_value = (
                    np.max(sub_X[:, this_feature_index]) +
                    np.min(sub_X[:, this_feature_index])) / 2
                left_branch_Y = sub_Y[
                    sub_X[:, this_feature_index] <= this_feature_value]
                right_branch_Y = sub_Y[
                    sub_X[:, this_feature_index] > this_feature_value]
                this_feature_gini_value = (
                    len(left_branch_Y) / n_samples * gini(left_branch_Y) +
                    len(right_branch_Y) / n_samples * gini(right_branch_Y))
                logger.debug(
                    'in feature({}:{}) value({}) gini_value({})\nleft_branch_Y : {}\nright_branch_Y : {}'
                    .format(this_feature_index,
                            self.feature_names[this_feature_index],
                            this_feature_value, this_feature_gini_value,
                            left_branch_Y, right_branch_Y))
                # Keep the split with the smallest Gini index seen so far.
                if this_feature_gini_value < best_gini_value:
                    best_gini_value = this_feature_gini_value
                    best_feature_column = this_feature_index
                    best_split_point = this_feature_value
            elif self.column_flags[this_feature_index] == 'discrete':
                # Discrete features are handled as in 'default'.
                this_feature_values = np.unique(sub_X[:, this_feature_index])
                for this_feature_value in this_feature_values:
                    left_branch_Y = sub_Y[sub_X[:, this_feature_index] ==
                                          this_feature_value]
                    right_branch_Y = sub_Y[
                        sub_X[:, this_feature_index] != this_feature_value]
                    this_feature_gini_value = (
                        len(left_branch_Y) / n_samples * gini(left_branch_Y)
                        + len(right_branch_Y) / n_samples *
                        gini(right_branch_Y))
                    logger.debug(
                        'in feature({}:{}) value({}) gini_value({})\nleft_branch_Y : {}\nright_branch_Y : {}'
                        .format(this_feature_index,
                                self.feature_names[this_feature_index],
                                this_feature_value, this_feature_gini_value,
                                left_branch_Y, right_branch_Y))
                    # Keep the split with the smallest Gini index seen so far.
                    if this_feature_gini_value < best_gini_value:
                        best_gini_value = this_feature_gini_value
                        best_feature_column = this_feature_index
                        best_split_point = this_feature_value
        else:
            raise NotImplementedError
    self.feature_column = best_feature_column
    self.split_value = best_split_point
    logger.debug('get the best split point : {}:{}/{}'.format(
        self.feature_column, self.feature_names[self.feature_column],
        self.split_value))
    # Split the data set on the chosen feature, deciding by the flag of the
    # best column rather than the loop variable left over from the search.
    if self.column_flags[best_feature_column] == 'continuous':
        self.split_op = '<='
        self.split_value = best_split_point
        best_left_branch_X = sub_X[
            sub_X[:, best_feature_column] <= best_split_point, :]
        best_left_branch_Y = sub_Y[
            sub_X[:, best_feature_column] <= best_split_point]
        best_right_branch_X = sub_X[
            sub_X[:, best_feature_column] > best_split_point, :]
        best_right_branch_Y = sub_Y[
            sub_X[:, best_feature_column] > best_split_point]
    elif self.column_flags[best_feature_column] == 'discrete':
        self.split_op = '=='
        self.split_value = best_split_point
        best_left_branch_X = sub_X[sub_X[:, best_feature_column] ==
                                   best_split_point, :]
        best_left_branch_Y = sub_Y[sub_X[:, best_feature_column] ==
                                   best_split_point]
        best_right_branch_X = sub_X[
            sub_X[:, best_feature_column] != best_split_point, :]
        best_right_branch_Y = sub_Y[
            sub_X[:, best_feature_column] != best_split_point]
    logger.debug('get left branch X : \n{}\nget left branch Y : {}'.format(
        best_left_branch_X, best_left_branch_Y))
    logger.debug(
        'get right branch X : \n{}\nget right branch Y : {}'.format(
            best_right_branch_X, best_right_branch_Y))
    self.left_tree = CartTreeClassifierNode(
        self.feature_names,
        self.column_flags,
        max_node_size=self.max_node_size,
        divide_way=self.divide_way,
        cost_func=self.cost_func)
    self.left_tree.fit_data(best_left_branch_X, best_left_branch_Y,
                            self.current_node_class)
    self.right_tree = CartTreeClassifierNode(
        self.feature_names,
        self.column_flags,
        max_node_size=self.max_node_size,
        divide_way=self.divide_way,
        cost_func=self.cost_func)
    self.right_tree.fit_data(best_right_branch_X, best_right_branch_Y,
                             self.current_node_class)
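# ---------------------------------------------------------------------------
# gini() is imported from elsewhere in the repo; a minimal numpy sketch of
# the usual definition it is expected to implement (assumption):
# gini(Y) = 1 - sum_k p_k^2 over the class proportions p_k of Y.
# ---------------------------------------------------------------------------
import numpy as np

def gini_sketch(Y):
    _, counts = np.unique(Y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.square(p).sum()

# gini_sketch(np.array([0, 0, 1, 1]))  # -> 0.5, a maximally impure binary node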
def fit_data(self, sub_X, sub_Y, parent_class):
    """
    sub_X : 2d array-like shape(n_samples, n_features)
    sub_Y : 1d array-like shape(n_samples, )
    parent_class : int
        The parent node's value, recorded during recursion; it serves as a
        fallback when this node receives no samples.
    """
    # TODO: stopping conditions for the recursion
    logger.info(
        'training...\ncurrent id : {}\ncurrent data size : {}'.format(
            self.id, sub_X.shape[0]))
    logger.debug('X : \n{}\nY : {}'.format(sub_X, sub_Y) + '\n' +
                 'parent_class : {}'.format(parent_class))
    # If the data set is empty, fall back to the parent's value.
    if len(sub_X) == 0:
        logger.debug('sub_X is empty ! ')
        self.set_leaf(parent_class.item())
        return
    # Use the mean of sub_Y as this node's value.
    self.current_node_value = np.mean(sub_Y).item()
    logger.debug('self.current_node_value : {}'.format(
        self.current_node_value))
    # TODO: there may be other stopping conditions
    # If at most max_node_size samples remain, stop splitting and predict
    # their mean.
    if sub_X.shape[0] <= self.max_node_size:
        logger.debug('sub_X is so small. n_samples : {}'.format(
            sub_X.shape[0]))
        self.set_leaf(self.current_node_value)
        return
    # Search for the best split (best_feature_column, best_split_point);
    # the data is split into left and right branches once the search ends.
    best_cost_value = float('inf')
    best_feature_column = None
    best_split_point = None
    for this_feature_index in range(0, len(self.feature_names)):
        # Every distinct value of this feature is a candidate split point;
        # compute the weighted cost for each.
        this_feature_values = np.unique(sub_X[:, this_feature_index])
        for this_feature_value in this_feature_values:
            n_samples = sub_X.shape[0]
            # Input features are assumed continuous here. The largest value
            # would leave the right branch empty, so skip it.
            if this_feature_value == np.amax(sub_X[:, this_feature_index]):
                continue
            left_branch_Y = sub_Y[
                sub_X[:, this_feature_index] <= this_feature_value]
            right_branch_Y = sub_Y[
                sub_X[:, this_feature_index] > this_feature_value]
            this_feature_cost_value = (
                len(left_branch_Y) / n_samples *
                self.cost_func(left_branch_Y) +
                len(right_branch_Y) / n_samples *
                self.cost_func(right_branch_Y))
            # Keep the split with the smallest cost seen so far.
            if this_feature_cost_value < best_cost_value:
                best_cost_value = this_feature_cost_value
                best_feature_column = this_feature_index
                best_split_point = this_feature_value
    self.feature_column = best_feature_column
    self.split_value = best_split_point
    logger.debug('get the best split point : {}:{}/{}'.format(
        self.feature_column, self.feature_names[self.feature_column],
        self.split_value))
    # Split the data set; continuous features split with '<='.
    self.split_op = '<='
    self.split_value = best_split_point
    best_left_branch_X = sub_X[
        sub_X[:, best_feature_column] <= best_split_point, :]
    best_left_branch_Y = sub_Y[
        sub_X[:, best_feature_column] <= best_split_point]
    best_right_branch_X = sub_X[
        sub_X[:, best_feature_column] > best_split_point, :]
    best_right_branch_Y = sub_Y[
        sub_X[:, best_feature_column] > best_split_point]
    logger.debug('get left branch X : \n{}\nget left branch Y : {}'.format(
        best_left_branch_X, best_left_branch_Y))
    logger.debug(
        'get right branch X : \n{}\nget right branch Y : {}'.format(
            best_right_branch_X, best_right_branch_Y))
    self.left_tree = CartTreeRegressionNode(
        self.feature_names,
        max_node_size=self.max_node_size,
        cost_func=self.cost_func)
    self.left_tree.fit_data(best_left_branch_X, best_left_branch_Y,
                            self.current_node_value)
    self.right_tree = CartTreeRegressionNode(
        self.feature_names,
        max_node_size=self.max_node_size,
        cost_func=self.cost_func)
    self.right_tree.fit_data(best_right_branch_X, best_right_branch_Y,
                             self.current_node_value)
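# ---------------------------------------------------------------------------
# square_error (the default cost_func in __init__ above) is defined elsewhere
# in the repo; a minimal sketch of the quantity the split search plausibly
# needs (assumption: within-branch squared deviation from the branch mean,
# the standard CART regression criterion). Not part of the module.
# ---------------------------------------------------------------------------
import numpy as np

def square_error_sketch(Y):
    return np.square(Y - Y.mean()).sum() if len(Y) else 0.0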