def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
    """
    Parameters
    ----------
    X : 2d array-like, shape (n_samples, n_features)
    Y : 1d array-like, shape (n_samples,)
    """
    n_features = X.shape[1]
    n_samples = X.shape[0]
    parameters = self.initialize_parameters(n_features)
    w = parameters['w']
    b = parameters['b']
    # Work in the (n_features, n_samples) layout.
    X_T = X.T
    Y_T = Y.reshape((1, -1))
    self.init_mini_batches(X_T, Y_T)
    for i in range(self.num_iterations):
        self.parameters, grads, cost = self.optimize_single(w, b, X_T, Y_T)
        w = self.parameters['w']
        b = self.parameters['b']
        this_loss = self.get_loss(X_valid, Y_valid)
        train_loss = self.get_loss(X, Y)
        self.information['test_loss'].append(this_loss)
        self.information['train_loss'].append(train_loss)
        self.information['cost'].append(cost)
        if i % 50 == 0:
            logger.info(
                'train {}/{} current cost: {}, train: {}, test: {}'.format(
                    i, self.num_iterations, cost, train_loss, this_loss))
    self.information['grads'] = grads
    return
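
# Hedged sketch (assumption, not the module's actual implementation): optimize_single
# is defined elsewhere; given the (w, b) parameters and the (n_features, n_samples)
# layout used above, one plausible version is a single gradient-descent step for a
# logistic-regression-style model. The name learning_rate is hypothetical.
import numpy as np

def optimize_single_sketch(w, b, X_T, Y_T, learning_rate=0.01):
    m = X_T.shape[1]
    # Forward pass: sigmoid activation over all samples, shape (1, n_samples).
    A = 1.0 / (1.0 + np.exp(-(np.dot(w.T, X_T) + b)))
    # Cross-entropy cost averaged over samples.
    cost = -np.mean(Y_T * np.log(A) + (1 - Y_T) * np.log(1 - A))
    # Gradients and one descent step.
    dw = np.dot(X_T, (A - Y_T).T) / m      # shape (n_features, 1)
    db = np.mean(A - Y_T)
    w = w - learning_rate * dw
    b = b - learning_rate * db
    return {'w': w, 'b': b}, {'dw': dw, 'db': db}, cost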
def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
    """
    Parameters
    ----------
    X : 2d array-like, shape (n_samples, n_features)
    Y : 1d array-like, shape (n_samples,)
    """
    n_features = X.shape[1]
    n_samples = X.shape[0]
    parameters = self.initialize_parameters(n_features)
    w = parameters['w']
    # Prepend a column of ones so the intercept is learned as part of w.
    X_reshaped = np.hstack([np.ones((n_samples, 1)), X])  # shape (n_samples, n_features + 1)
    Y_reshaped = Y.reshape((-1, 1))  # shape (n_samples, 1)
    for i in range(self.num_iterations):
        self.parameters, grads, cost = self.optimize_single(
            w, X_reshaped, Y_reshaped)
        w = self.parameters['w']
        this_loss = self.get_loss(X_valid, Y_valid)
        train_loss = self.get_loss(X, Y)
        self.information['test_loss'].append(this_loss)
        self.information['train_loss'].append(train_loss)
        logger.info(
            'train {}/{} current cost: {}, train: {}, test: {}'.format(
                i, self.num_iterations, cost, train_loss, this_loss))
    self.information['grads'] = grads
    return
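
# Hedged sketch (toy example, not part of the module): illustrates the bias-column
# trick used above, where a column of ones is prepended so the intercept is folded
# into the weight vector. The names X_aug, w and y_hat are hypothetical.
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])             # (n_samples, n_features)
w = np.array([[0.5], [0.1], [-0.2]])               # (n_features + 1, 1); w[0] plays the role of b
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])   # (n_samples, n_features + 1)
y_hat = X_aug @ w                                  # equivalent to X @ w[1:] + w[0]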
def fit_and_valid(self, X, Y, X_valid, Y_valid, watch=False):
    self.init_mini_batches(X, Y)
    logger.debug('X : \n{} Y : {}'.format(X, Y))
    # Fit the initial estimator that provides the starting prediction.
    init_estimator = self.base_estimator(
        max_node_size=self.max_tree_node_size,
        divide_way=self.base_divide_way)
    init_estimator.fit(X, Y)
    self.parameters['f'].append(init_estimator)
    self.parameters['lr'].append(1)
    for i in range(self.n_estimators):
        if self.mini_batch == 0:
            # Use all samples.
            cost = self.optimizer(X, Y)
        else:
            # Use mini_batch samples.
            X_batch, Y_batch = self.get_mini_batch()
            cost = self.optimizer(X_batch, Y_batch)
        if i % self.print_interval == 0:
            this_loss = self.get_test_cost(X_valid, Y_valid)
            train_loss = self.get_test_cost(X, Y)
            self.information['valid_loss'].append(this_loss)
            self.information['train_loss'].append(train_loss)
            logger.info(
                'train {}/{} current cost: {}, train: {}, valid: {}'.format(
                    i, self.n_estimators, cost, train_loss, this_loss))
        self.update_mini_batch()
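
# Hedged sketch (assumption): init_mini_batches, get_mini_batch and update_mini_batch
# are defined elsewhere in the class; a plausible implementation that shuffles the
# sample order once and then cycles through fixed-size slices could look like this.
# The attribute names _order, _X, _Y and _cursor are hypothetical.
import numpy as np

class MiniBatchMixin:
    def init_mini_batches(self, X, Y):
        self._order = np.random.permutation(X.shape[0])
        self._X, self._Y, self._cursor = X, Y, 0

    def get_mini_batch(self):
        idx = self._order[self._cursor:self._cursor + self.mini_batch]
        return self._X[idx], self._Y[idx]

    def update_mini_batch(self):
        self._cursor += self.mini_batch
        if self._cursor >= len(self._order):
            # Re-shuffle once a full pass over the data is done.
            self._order = np.random.permutation(len(self._order))
            self._cursor = 0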
def train(self):
    for i in range(self.num_iterations):
        cost, train_loss, valid_loss = self.train_one()
        if i % self.print_intervel == 0:
            logger.info(
                'train {}/{} current cost: {}, train: {}, valid: {}'.format(
                    i, self.num_iterations, cost, train_loss, valid_loss))
def fit(self, X, Y, watch=False):
    logger.debug('X : \n{} Y : {}'.format(X, Y))
    init_estimator = self.base_estimator(
        max_node_size=self.max_tree_node_size,
        divide_way=self.base_divide_way)
    init_estimator.fit(X, Y)
    self.parameters['f'].append(init_estimator)
    self.parameters['lr'].append(1)
    for i in range(self.n_estimators):
        cost = self.optimizer(X, Y)
        if i % self.print_interval == 0:
            logger.info('train {}/{} current cost: {}'.format(
                i, self.n_estimators, cost))
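
# Hedged sketch (assumption, not the repository's actual predict method): shows how
# the lists parameters['f'] and parameters['lr'] built during fit() would typically
# be combined at prediction time in gradient boosting -- the initial estimator gives
# the starting prediction and each later estimator is added scaled by its learning rate.
import numpy as np

def predict_sketch(self, X):
    pred = np.zeros(X.shape[0])
    for estimator, lr in zip(self.parameters['f'], self.parameters['lr']):
        pred += lr * estimator.predict(X)
    return pred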
def fit_data(self, sub_X, sub_Y, parent_class):
    """
    sub_X : 2d array-like, shape (n_samples, n_features)
    sub_Y : 1d array-like, shape (n_samples,)
    parent_class : scalar
        Value of the parent node, recorded during the recursion; used as a
        fallback when this node receives no samples.
    """
    # TODO: additional stopping conditions for the recursion
    logger.info(
        'training...\ncurrent id : {}\ncurrent data size : {}'.format(
            self.id, sub_X.shape[0]))
    logger.debug('X : \n{}\nY : {}'.format(sub_X, sub_Y) + '\n' +
                 'parent_class : {}'.format(parent_class))
    # If the data set is empty at this point, fall back to the parent's value.
    if len(sub_X) == 0:
        logger.debug('sub_X is empty ! ')
        self.set_leaf(parent_class.item())
        return
    # Take the mean of sub_Y as this node's value.
    self.current_node_value = np.mean(sub_Y).item()
    logger.debug('self.current_node_value : {}'.format(
        self.current_node_value))
    # TODO: there may be other stopping conditions
    # If only a few samples remain (at most max_node_size), stop splitting and
    # keep the mean of these samples as a leaf value.
    if sub_X.shape[0] <= self.max_node_size:
        logger.debug('sub_X is so small. n_samples : {}'.format(
            sub_X.shape[0]))
        self.set_leaf(self.current_node_value)
        return
    # Search for the best split of the data set (best feature column and best
    # split value); the data is then divided into left and right branches.
    best_cost_value = float('inf')
    best_feature_column = None
    best_split_point = None
    for this_feature_index in range(0, len(self.feature_names)):
        # Every distinct value of this feature is a candidate split point.
        this_feature_values = np.unique(sub_X[:, this_feature_index])
        for this_feature_value in this_feature_values:
            # For each candidate value, compute the cost of splitting there.
            n_samples = sub_X.shape[0]
            # The input data is assumed to be continuous.
            # The right-most end point cannot be used as a split, so skip it.
            if this_feature_value == np.amax(sub_X[:, this_feature_index]):
                continue
            left_branch_Y = sub_Y[
                sub_X[:, this_feature_index] <= this_feature_value]
            right_branch_Y = sub_Y[
                sub_X[:, this_feature_index] > this_feature_value]
            this_feature_cost_value = (
                len(left_branch_Y) / n_samples * self.cost_func(left_branch_Y)
                + len(right_branch_Y) / n_samples *
                self.cost_func(right_branch_Y))
            # If splitting at this value gives a lower cost, update the best split.
            if this_feature_cost_value < best_cost_value:
                best_cost_value = this_feature_cost_value
                best_feature_column = this_feature_index
                best_split_point = this_feature_value
    self.feature_column = best_feature_column
    self.split_value = best_split_point
    logger.debug('get the best split point : {}:{}/{}'.format(
        self.feature_column, self.feature_names[self.feature_column],
        self.split_value))
    # Split the data set at the best point found above; features are treated
    # as continuous, so the test is '<='.
    self.split_op = '<='
    self.split_value = best_split_point
    best_left_branch_X = sub_X[
        sub_X[:, best_feature_column] <= best_split_point, :]
    best_left_branch_Y = sub_Y[
        sub_X[:, best_feature_column] <= best_split_point]
    best_right_branch_X = sub_X[
        sub_X[:, best_feature_column] > best_split_point, :]
    best_right_branch_Y = sub_Y[
        sub_X[:, best_feature_column] > best_split_point]
    logger.debug('get left branch X : \n{}\nget left branch Y : {}'.format(
        best_left_branch_X, best_left_branch_Y))
    logger.debug(
        'get right branch X : \n{}\nget right branch Y : {}'.format(
            best_right_branch_X, best_right_branch_Y))
    self.left_tree = CartTreeRegressionNode(
        self.feature_names,
        max_node_size=self.max_node_size,
        cost_func=self.cost_func)
    self.left_tree.fit_data(best_left_branch_X, best_left_branch_Y,
                            self.current_node_value)
    self.right_tree = CartTreeRegressionNode(
        self.feature_names,
        max_node_size=self.max_node_size,
        cost_func=self.cost_func)
    self.right_tree.fit_data(best_right_branch_X, best_right_branch_Y,
                             self.current_node_value)
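
# Hedged sketch (toy example, not from the module): evaluates one candidate split
# the same way fit_data does -- a sample-weighted average of cost_func over the two
# branches. cost_func here is assumed to be per-branch variance (squared-error
# impurity); the actual cost_func is injected into the node from outside.
import numpy as np

def variance_cost(y):
    return np.var(y) if len(y) else 0.0

sub_Y = np.array([1.0, 1.2, 0.9, 5.0, 5.1])
feature = np.array([0.1, 0.2, 0.3, 0.8, 0.9])
split_value = 0.3
left, right = sub_Y[feature <= split_value], sub_Y[feature > split_value]
cost = (len(left) / len(sub_Y) * variance_cost(left)
        + len(right) / len(sub_Y) * variance_cost(right))
# A small cost means the split separates the targets well.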
train_Y = train_ori_Y.values

# # Cross-validation
logger.setLevel(logging.INFO)
n_splits = 5
cv = ShuffleSplit(n_splits=n_splits)
for train_indices, test_indices in cv.split(train_X):
    lr = DecisionTreeRegressor(max_node_size=1000)
    lr.fit(train_X[train_indices], train_Y[train_indices])
    y_pred = lr.predict(train_X[test_indices])
    logger.info(pearson_correlation(y_pred, train_Y[test_indices]))

# # Train the model on the full training set and write the results
lr = DecisionTreeRegressor(max_node_size=1000)
lr.fit(train_X, train_Y)
y_pred = lr.predict(test_X)
sub = pd.DataFrame(y_pred)
sub.to_csv('./results/' + 'CART-m1000-no_weight-add_tag_feat' +
           str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + ".csv",
           index=False, header=None)

# # Record the submission result
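
# Hedged sketch (assumption): pearson_correlation is imported from elsewhere in the
# project and is not shown here; a standard implementation based on np.corrcoef
# would look like this.
import numpy as np

def pearson_correlation(y_pred, y_true):
    return np.corrcoef(np.asarray(y_pred).ravel(),
                       np.asarray(y_true).ravel())[0, 1]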
logger.setLevel(logging.INFO)
n_splits = 2
k_splits = 5
# cv = ShuffleSplit(n_splits=n_splits)
cv = KFold(n_splits=k_splits)
score = 0
models = []
for train_indices, test_indices in cv.split(train_X):
    lr = GradientBoostingRegression(loss='lad',
                                    learning_rate=0.05,
                                    n_estimators=100,
                                    max_tree_node_size=50)
    lr.fit_and_valid(train_X[train_indices], train_Y[train_indices],
                     train_X[test_indices], train_Y[test_indices],
                     mini_batch=4000, watch=True)
    y_pred = lr.predict(train_X[test_indices])
    this_score = pearson_correlation(y_pred, train_Y[test_indices])
    score += this_score
    logger.info(this_score)
    models.append(lr)
logger.info('score : {}'.format(score / k_splits))

# Plot the validation-loss curve of the last model, then of every fold's model.
plt.plot(range(len(lr.information['test_loss'])),
         lr.information['test_loss'], label='test')
plt.legend()
for model in models:
    plt.plot(range(len(model.information['test_loss'])),
             model.information['test_loss'], label='test')
plt.legend()

# # Train the model and write results
from pyml.logger import logger

logger.debug('test')
logger.info('t')


def functions():
    logger.error('error')


functions()