def assess(self, y, pred_y, return_dist=False, check_input=True):
    """Score predictions against observations.

    Notes
    -----
    Mind the dtype of the data: classification prefers ``str`` labels and
    regression prefers ``float64``. Fitting with a non-preferred dtype may
    lead to a type mismatch here, so convert beforehand.

    Parameters
    ----------
    y: observed values, ndarray
    pred_y: predicted values, ndarray
    return_dist: whether to also return the prediction distribution, bool,
        default False (forwarded to ``stats.accuracy``; unused for regression)
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: result of ``stats.accuracy`` when ``self.mode == 'c'``, otherwise the
       result of ``stats.r_sqr`` (documented upstream as accuracy / R^2, float)
    """
    check_type('check_input', type(check_input), bool)
    if check_input:
        check_type('return_dist', type(return_dist), bool)
        # transform=False: only type-check / cast labels, no one-hot encoding
        y = self.check_input_y_(y, 'y', transform=False)
        pred_y = self.check_input_y_(pred_y, 'pred_y', transform=False)
        check_index_match(y, pred_y, 'y', 'pred_y', only_len=True)

    if self.mode != 'c':
        return stats.r_sqr(y, pred_y)
    return stats.accuracy(y, pred_y, return_dist, self.classes)
def check_input_y_(self, y, name='y'):
    """Validate that ``y`` is a pandas Series and cast it to string labels.

    Parameters
    ----------
    y: target vector, expected to be a Series
    name: label used for ``y`` in the validation message, str

    Returns
    -------
    0: ``y`` converted to ``str`` dtype, Series
    """
    check_type(name, type(y), pd.Series)
    labels = y.astype('str')
    return labels
def assess(self, y, p_y, return_dist=False, check_input=True):
    """Score predictions against observations.

    Notes
    -----
    Mind the dtype of the data: classification prefers ``str`` labels and
    regression prefers ``float64``. Fitting with a non-preferred dtype may
    lead to a type mismatch here, so convert beforehand.

    Parameters
    ----------
    y: observed values, Series
    p_y: predicted values, Series
    return_dist: whether to also return the prediction distribution, bool,
        default False (forwarded to ``stats.accuracy``)
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: ``stats.accuracy`` result in mode 'c', ``stats.r_sqr`` result in
       mode 'r'; ``None`` for any other mode
    """
    mode = self.mode

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input:
        y = self.check_input_y_(y, name='y', mode=mode)
        p_y = self.check_input_y_(p_y, name='p_y', mode=mode)
        check_index_match(y, p_y, 'y', 'p_y')

    # accuracy for classification, R^2 for regression
    if mode == 'c':
        return stats.accuracy(y, p_y, return_dist, self.classes)
    if mode == 'r':
        return stats.r_sqr(y, p_y)
    return None
def check_input_y_(self, y, name='y', transform=True):
    """Validate a target array and optionally encode/flatten it.

    Parameters
    ----------
    y: target array, must be an ndarray
    name: label used for ``y`` in validation/error messages, str
    transform: when False, return right after the type check (and the
        ``str`` cast in classification mode)

    Returns
    -------
    ``y`` alone when ``transform`` is False; otherwise ``(Y, classes)``
    where ``Y`` is reshaped to 2-D with ``self.output_size`` columns.

    Raises
    ------
    ValueError: when the shape of the encoded target is compatible with
        neither ``self.output_shape`` nor a batch of ``self.output_shape``.
    """
    # type check
    check_type(name, type(y), np.ndarray)
    if self.mode == 'c':
        y = y.astype('str')
    if transform == False:
        return y

    # classification: one-hot encode a 1-D label vector
    # (a single discrete column becomes several 0/1 columns);
    # an already multi-column target is taken as is
    if self.mode == 'c':
        if len(y.shape) == 1:
            Y, classes = dp.dummy_var(y)
        else:
            Y, classes = y, list(range(y.shape[1]))
    else:
        Y, classes = y.reshape((-1, 1)), []

    # shape adjustment
    flat_size = self.output_size
    message = 'The shape of ' + name + ' does not match to output_shape'
    extra_dims = len(Y.shape) - len(self.output_shape)

    # (m,k)->(1,m*k): a single sample
    if extra_dims == 0 and Y.shape == self.output_shape:
        return Y.reshape((1, flat_size)), classes
    # (n,m,k)->(n,m*k): a batch of samples
    if extra_dims == 1 and Y.shape[1:] == self.output_shape:
        return Y.reshape((-1, flat_size)), classes
    raise ValueError(message)
def check_input_t_(self, theta):
    """Validate the parameter matrix ``theta``.

    A Series is promoted to a one-column DataFrame before checking. Every
    column must have dtype ``float64``. Nothing is returned; validation
    failures are reported by ``check_type``.

    Parameters
    ----------
    theta: parameter vector/matrix, Series or DataFrame
    """
    if isinstance(theta, pd.Series):
        theta = theta.to_frame()
    check_type('theta', type(theta), pd.DataFrame)
    type_list = [np.float64]
    # Iterate over the dtype values directly: ``theta.dtypes[i]`` indexes a
    # label-indexed Series with an integer, which is label-based for integer
    # column names and a deprecated positional fallback otherwise.
    for dtype in theta.dtypes:
        check_type('theta', dtype, type_list)
def access(self, y, p_y, return_dist=False):
    """Compute classification accuracy of ``p_y`` against ``y``.

    Parameters
    ----------
    y: observed labels (validated by ``self.check_input_y_``)
    p_y: predicted labels (validated by ``self.check_input_y_``)
    return_dist: forwarded to ``stats.accuracy``, bool, default False

    Returns
    -------
    0: whatever ``stats.accuracy`` returns for the validated inputs, using
       the class labels stored in ``self.y_d_p['value']``
    """
    check_type('return_dist', type(return_dist), bool)
    y = self.check_input_y_(y)
    p_y = self.check_input_y_(p_y, 'p_y')
    check_index_match(y, p_y, 'y', 'p_y')
    labels = self.y_d_p['value'].values
    return stats.accuracy(y, p_y, return_dist, labels)
def assess(self, y, p_y, mode=None):
    """Score predictions against observations.

    Notes
    -----
    Mind the dtype of the data: classification prefers ``str`` labels and
    regression prefers ``float64``. Fitting with a non-preferred dtype may
    lead to a type mismatch here, so convert beforehand.

    Parameters
    ----------
    y: observed values, Series
    p_y: predicted values, Series
    mode: 'c' -> classification, 'r' -> regression, str; when omitted the
        mode of the first ensemble unit's tree is used

    Returns
    -------
    0: ``stats.accuracy`` result in mode 'c', ``stats.r_sqr`` result in
       mode 'r'
    """
    # parameter validation
    if mode is None:
        mode = self.units[0].tree.mode
    check_type('mode', type(mode), str)
    allowed = ['c', 'r']
    check_limit('mode', mode in allowed, str(allowed))

    # the validator returns (data, continuity); only the data is needed here
    y, _ = self.unit_test.check_input_y_(y, name='y')
    p_y, _ = self.unit_test.check_input_y_(p_y, name='p_y')
    check_index_match(y, p_y, 'y', 'p_y')

    # accuracy for classification, R^2 for regression
    if mode == 'c':
        return stats.accuracy(y, p_y)
    if mode == 'r':
        return stats.r_sqr(y, p_y)
    return None
def check_input_y_(self, y, name='y', mode='c'):
    """Validate a target Series according to the working mode.

    Parameters
    ----------
    y: target vector, must be a Series
    name: label used for ``y`` in validation messages, str
    mode: 'c' -> cast to ``str`` labels; 'r' -> require an ``int64`` or
        ``float64`` dtype and return ``y`` unchanged

    Returns
    -------
    0: the (possibly cast) Series; ``None`` for any other ``mode``
    """
    check_type(name, type(y), pd.Series)
    if mode == 'c':
        return y.astype('str')
    if mode == 'r':
        numeric_types = [np.int64, np.float64]
        check_type(name, y.dtype, numeric_types)
        return y
    return None
def check_input_X_(self, X):
    """Validate the feature matrix and return it as a DataFrame.

    A Series is promoted to a one-column DataFrame. Every column must have
    dtype ``int64`` or ``float64``; failures are reported by ``check_type``.

    Parameters
    ----------
    X: feature matrix, DataFrame (or Series for a single feature)

    Returns
    -------
    0: the validated feature matrix, DataFrame
    """
    if isinstance(X, pd.Series):
        X = X.to_frame()
    check_type('X', type(X), pd.DataFrame)
    type_list = [np.int64, np.float64]
    # Walk the dtypes by position with enumerate: ``X.dtypes[i]`` would look
    # the integer up as a *label* when column names are integers, and relies
    # on a deprecated positional fallback when they are not.
    for i, dtype in enumerate(X.dtypes):
        check_type('column %d in X' % i, dtype, type_list)
    return X
def assess(self, y, p_y, theta=None, detailed=False, check_input=True):
    """Evaluate a regression fit.

    Note
    ----
    After fitting, the training-set r2 and cost are stored on the instance
    and can be read through ``.score`` and ``.cost``.

    Parameters
    ----------
    y: observed value vector, Series
    p_y: predicted value vector, Series
    theta: parameter vector; defaults to ``self.theta``. Only its length is
        used (as the parameter count for the adjusted r2)
    detailed: whether to return the detailed evaluation, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: r2 (``stats.r_sqr`` result) or, when ``detailed`` is set, a Series
       indexed by 'r_sqr', 'adj_r_sqr' and 'cost'
    """
    # use the caller's theta or fall back to the cached one
    if theta is None:
        theta = self.theta

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input:
        check_type('detailed', type(detailed), bool)
        self.check_input_y_(y)
        self.check_input_y_(p_y, 'p_y')
        check_index_match(y, p_y, 'y', 'p_y')

    r_sqr = stats.r_sqr(y, p_y)
    if detailed == False:
        return r_sqr

    # detailed evaluation: adjusted r2 and cost
    params_n, samples_n = len(theta), len(y)
    cost = self.cost_(y, p_y)
    adj_r_sqr = stats.adj_r_sqr(r_sqr, samples_n, params_n)
    rows = [
        ('r_sqr', r_sqr),
        ('adj_r_sqr', adj_r_sqr),
        ('cost', cost),
    ]
    table = pd.DataFrame(rows, columns=['index', 'value'])
    return table.set_index('index').iloc[:, 0]
def fit(self, X, y, test_X=None, test_y=None, show_time=False,
        monitor_cost=False, monitor_score=False, check_input=True):
    """Fit the neural network to the given data.

    Note
    ----
    All input data must be continuous numeric values; preprocess any other
    type yourself.

    Parameters
    ----------
    X: feature matrix, ndarray(samples_n, input_shape)<float64,int64>
    y: target vector, ndarray(samples_n,)<str,float64,int64>
    test_X: test feature matrix, same layout as ``X``, optional
    test_y: test target vector, same layout as ``y``, optional
    show_time: whether to print the elapsed time, bool, default False
    monitor_cost: monitor cost changes, bool, default False
    monitor_score: monitor score changes, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Raises
    ------
    ValueError: in classification mode when fewer than two classes are found.

    Side effects
    ------------
    When ``check_input`` is set, ``self.classes`` is replaced. The elapsed
    times are added to ``self.time_cost['input check']`` and
    ``self.time_cost['Total']``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # input validation
    check_start = time.perf_counter()
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        check_type('show_time', type(show_time), bool)
        check_type('monitor_cost', type(monitor_cost), bool)
        check_type('monitor_score', type(monitor_score), bool)
        X = self.check_input_X_(X)
        y, self.classes = self.check_input_y_(y)
        if (len(self.classes) < 2) & (self.mode == 'c'):
            raise ValueError('too few classes,should >1')
        check_index_match(X, y, 'X', 'y', only_len=True)
        if test_X is not None:
            test_X = self.check_input_X_(test_X, name='test_X')
            # the classes found in the test target are not used
            test_y, _ = self.check_input_y_(test_y, name='test_y')
            check_index_match(test_X, test_y, 'test_X', 'test_y',
                              only_len=True)
    self.time_cost['input check'] += time.perf_counter() - check_start

    # optimisation
    self.optimize_(X, y, test_X, test_y, monitor_cost, monitor_score)

    if show_time == True:
        print('\ntime used for training: %f' % (time.perf_counter() - start))
    self.time_cost['Total'] += time.perf_counter() - start
def check_input_y_(self, y, name='y', transform=True):
    """Validate a target array and optionally one-hot encode it.

    Parameters
    ----------
    y: target array, must be an ndarray
    name: label used for ``y`` in validation messages, str
    transform: when False, return right after the type check (and the
        ``str`` cast in classification mode)

    Returns
    -------
    ``y`` alone when ``transform`` is False; otherwise ``(Y, y, classes)``:
    the encoded target, the (cast) input, and the class list (empty in
    regression mode).
    """
    # type check
    check_type(name, type(y), np.ndarray)

    # classification forces string labels
    if self.mode == 'c':
        y = y.astype('str')

    # stop here when no further transformation is requested
    if transform == False:
        return y

    if self.mode != 'c':
        # regression: a single output column, no classes
        return y.reshape((-1, 1)), y, []

    # classification: one-hot encode a 1-D label vector
    # (a single discrete column becomes several 0/1 columns)
    if len(y.shape) == 1:
        Y, classes = dp.dummy_var(y)
    else:
        Y, classes = y, list(range(y.shape[1]))
    return Y, y, classes
def __init__(self, mode='c', units_n=10, units_type='cart',
             depth_max=None, split_sample_n=2, leaf_sample_n=1,
             features_use='sqrt', features_reuse=True):
    """Validate and store the ensemble configuration.

    Parameters
    ----------
    mode: 'c' or 'r' (case-insensitive), str
    units_n: number of ensemble units, int, >= 1
    units_type: 'id3', 'c4.5' or 'cart' (case-insensitive), str
    depth_max, split_sample_n, leaf_sample_n, features_use, features_reuse:
        forwarded unchanged to ``dt.DecisionTree`` and stored on the instance

    ``self.unit_test`` holds a ``dt.DecisionTree`` built with the same
    settings; the other methods use it for its validation helpers.
    """
    # check_type(name, actual type, required type)
    # check_limit(name, condition, hint with the accepted values)
    check_type('mode', type(mode), str)
    allowed_modes = ['c', 'r']
    mode = mode.lower()
    check_limit('mode', mode in allowed_modes, str(allowed_modes))

    check_type('units_n', type(units_n), int)
    check_limit('units_n', units_n >= 1, 'value>=1')

    check_type('units_type', type(units_type), str)
    allowed_types = ['id3', 'c4.5', 'cart']
    units_type = units_type.lower()
    check_limit('units_type', units_type in allowed_types,
                str(allowed_types))

    # settings shared verbatim by the probe tree and the instance
    tree_options = {
        'depth_max': depth_max,
        'split_sample_n': split_sample_n,
        'leaf_sample_n': leaf_sample_n,
        'features_use': features_use,
        'features_reuse': features_reuse,
    }

    # store parameters
    self.unit_test = dt.DecisionTree(mode=mode, model_type=units_type,
                                     **tree_options)
    self.mode = mode
    self.units_n = units_n
    self.units_type = units_type
    for option, value in tree_options.items():
        setattr(self, option, value)
def predict(self, X, theta=None, show_time=False, check_input=True):
    """Predict with the linear model.

    Note
    ----
    When ``theta`` is not given, the internally stored ``self.theta`` is used.

    Parameters
    ----------
    X: feature matrix, DataFrame
    theta: parameter vector, Series
    show_time: whether to print the elapsed time, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: predicted value vector, Series indexed like ``X``
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # use the caller's theta or fall back to the cached one
    if theta is None:
        theta = self.theta

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        X = self.check_input_X_(X)
        self.check_input_t_(theta)
        check_items_match(X.columns, theta, 'features in X', 'theta',
                          'numbers', mode='len')

    # prediction
    p_y = pd.Series(self.linear_(X, theta), index=X.index)

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for predict: %f' % time_cost)
    return p_y
def assess(self, y, p_y, mode=None):
    """Score predictions against observations.

    Notes
    -----
    Mind the dtype of the data: classification prefers ``str`` labels and
    regression prefers ``float64``. Fitting with a non-preferred dtype may
    lead to a type mismatch here, so convert beforehand.

    Parameters
    ----------
    y: observed values, Series
    p_y: predicted values, Series
    mode: 'c' -> classification, 'r' -> regression, str; defaults to
        ``self.mode``

    Returns
    -------
    0: ``stats.accuracy`` on the str-cast inputs in mode 'c';
       ``stats.r_sqr`` in mode 'r' (a warning is printed when it is negative)
    """
    # parameter validation
    if mode is None:
        mode = self.mode
    check_type('mode', type(mode), str)
    allowed = ['c', 'r']
    check_limit('mode', mode in allowed, str(allowed))
    check_index_match(y, p_y, 'y', 'p_y')

    # accuracy for classification
    if mode == 'c':
        return stats.accuracy(y.astype('str'), p_y.astype('str'))
    if mode != 'r':
        return None

    # R^2 for regression
    r_sqr = stats.r_sqr(y, p_y)
    if r_sqr < 0:
        print('warning: R2 is less than 0, which means bad fitting,'
              '\ntry to reduce the learning rate')
    return r_sqr
def check_input_X_(self, X, name='X'):
    """Validate a feature array and flatten each sample to one row.

    Parameters
    ----------
    X: feature array, ndarray with dtype ``int64`` or ``float64``; either a
        single sample of shape ``self.input_shape`` or a batch whose
        trailing dimensions equal ``self.input_shape``
    name: label used for ``X`` in validation/error messages, str

    Returns
    -------
    0: ``X`` reshaped to 2-D with ``self.input_size`` columns

    Raises
    ------
    ValueError: when the shape of ``X`` matches neither layout.
    """
    # type check
    check_type(name, type(X), np.ndarray)
    numeric_types = [np.int64, np.float64]
    check_type(name, X.dtype, numeric_types)

    # shape adjustment
    flat_size = self.input_size
    message = 'The shape of ' + name + ' does not match to input_shape'
    extra_dims = len(X.shape) - len(self.input_shape)

    # (m,k)->(1,m*k): a single sample
    if extra_dims == 0 and X.shape == self.input_shape:
        return X.reshape((1, flat_size))
    # (n,m,k)->(n,m*k): a batch of samples
    if extra_dims == 1 and X.shape[1:] == self.input_shape:
        return X.reshape((-1, flat_size))
    raise ValueError(message)
def bind_func_(self, activation, softmax, optimizer, mode):
    """Bind activation, output/cost and optimizer callables to the instance.

    Parameters
    ----------
    activation: hidden-layer activation, one of 'sigm', 'tanh', 'relu'
    softmax: in mode 'c', use softmax + log-likelihood when True, otherwise
        sigmoid + cross-entropy, bool
    optimizer: one of 'sgd', 'magd', 'nagd', 'adam', str
    mode: 'c' -> classification, 'r' -> regression

    Raises
    ------
    ValueError: for an unknown activation, mode or optimizer.

    Side effects
    ------------
    Sets ``activation_``/``activation``, ``cost``/``cost_``,
    ``output_activation``/``output_activation_``, ``softmax`` and
    ``optimizer_``/``optimizer`` on ``self``.
    """
    # parameter type checks
    check_type('activation', type(activation), [str, tuple])
    check_type('softmax', type(softmax), bool)
    check_type('optimizer', type(optimizer), str)

    # parameter value checks
    known_activations = ['sigm', 'tanh', 'relu']
    check_limit('activation', activation in known_activations,
                str(known_activations))
    known_optimizers = ['sgd', 'magd', 'nagd', 'adam']
    check_limit('optimizer', optimizer in known_optimizers,
                str(known_optimizers))

    # hidden-layer activation: option -> name of the bound method
    activation_methods = (
        ('sigm', 'sigmoid_'),
        ('tanh', 'tanh_'),
        ('relu', 'relu_'),
    )
    for option, method_name in activation_methods:
        if activation == option:
            self.activation_ = getattr(self, method_name)
            break
    else:
        raise ValueError('Unknown activation function')
    self.activation = activation

    # output-layer activation and cost function
    if mode == 'c':
        if softmax == False:
            cost, out, cost_method, out_method = (
                'ce', 'sigm', 'cross_ent_', 'sigmoid_')
        else:
            cost, out, cost_method, out_method = (
                'log', 'soft', 'log_like_', 'softmax_')
    elif mode == 'r':
        cost, out, cost_method, out_method = (
            'mse', 'none', 'mean_sqr_err_', 'identity_')
    else:
        raise ValueError('Unknown mode')
    self.cost = cost
    self.output_activation = out
    self.cost_ = getattr(self, cost_method)
    self.output_activation_ = getattr(self, out_method)
    self.softmax = softmax

    # optimizer: option -> name of the bound method
    optimizer_methods = (
        ('sgd', 'sgd_'),
        ('magd', 'momentum_'),
        ('nagd', 'nesterov_'),
        ('adam', 'adam_'),
    )
    for option, method_name in optimizer_methods:
        if optimizer == option:
            self.optimizer_ = getattr(self, method_name)
            break
    else:
        raise ValueError('Unknown optimizer')
    self.optimizer = optimizer
def __init__(self, mode='c', units_type='cart', iter_max=10, depth_max=0,
             learning_rate=1.0):
    """Validate and store the boosting configuration.

    Parameters
    ----------
    mode: 'r' or 'c' (case-insensitive), str
    units_type: only 'cart' is accepted (case-insensitive), str
    iter_max: number of boosting iterations, int, >= 1
    depth_max: maximum tree depth; an int 0 is replaced by 3
    learning_rate: float in (0.0, 1.0]

    The weak learners are always regression CART trees: ``self.units_type``
    is stored as 'cart' and ``self.units_mode`` as 'r' regardless of ``mode``.
    """
    # validate parameter types and values
    check_type('mode', type(mode), str)
    allowed_modes = ['r', 'c']
    mode = mode.lower()
    check_limit('mode', mode in allowed_modes, str(allowed_modes))

    check_type('units_type', type(units_type), str)
    allowed_types = ['cart']
    units_type = units_type.lower()
    check_limit('units_type', units_type in allowed_types,
                str(allowed_types))

    check_type('iter_max', type(iter_max), int)
    check_limit('iter_max', iter_max >= 1, 'value>=1')

    check_type('learning_rate', type(learning_rate), float)
    check_limit('learning_rate', learning_rate > 0.0, 'value>0.0')
    check_limit('learning_rate', learning_rate <= 1.0, 'value<=1.0')

    # depth_max == 0 means "use the default", which is 3 for both modes.
    # Note from the original author: this default follows sklearn; using
    # depth_max=1 for regression was tried and performed very poorly.
    if type(depth_max) is int and depth_max == 0 and mode in ('r', 'c'):
        depth_max = 3

    # store parameters
    self.unit_test = dt.DecisionTree(mode='r', model_type='cart',
                                     depth_max=depth_max)
    self.mode = mode
    self.units_type = 'cart'
    self.units_mode = 'r'
    self.iter_max = iter_max
    self.depth_max = depth_max
    self.learning_rate = learning_rate
def predict(self, X, return_a=False, show_time=False, check_input=True):
    """Predict with the neural network.

    Parameters
    ----------
    X: feature matrix, ndarray
    return_a: whether to return the output-layer activations instead of
        labels / flattened values, bool, default False
    show_time: whether to print the elapsed time, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: prediction, ndarray. When the result's first dimension has length 1
       (a single sample) that dimension is dropped.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        check_type('show_time', type(show_time), bool)
        check_type('return_a', type(return_a), bool)
        X = self.check_input_X_(X)

    # forward propagation
    a = self.forward_prop_(X)

    # assemble the result
    if self.mode == 'c':
        if return_a == False:
            output = self.prob_to_label_(a, self.classes)
        else:
            # restore the per-sample output shape
            output = a.reshape((-1, ) + self.output_shape)
    else:
        if return_a == False:
            output = a.reshape((-1, ))
        else:
            output = a

    if show_time == True:
        print('\ntime used for predict: %f' % (time.perf_counter() - start))

    # unwrap a single-sample result
    if output.shape[0] == 1:
        output = output[0]
    return output
def predict(self, X, theta=None, classes=None, classes_paired=None,
            return_proba=False, show_time=False, check_input=True):
    """Predict with the logistic-regression model.

    Note
    ----
    ``theta``, ``classes`` and ``classes_paired`` default to the values
    stored on the instance.

    Parameters
    ----------
    X: feature matrix, DataFrame
    theta: parameter vector/matrix
    classes: class labels, list(str)
    classes_paired: positive/negative class selection matrix, ndarray(m, n)
    return_proba: whether to return class probabilities, bool, default False
    show_time: whether to print the elapsed time, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: a Series named 'classify' holding the predicted labels, or, when
       ``return_proba`` is set, a DataFrame with one column per class;
       both are indexed like ``X``
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # use the caller's parameters or fall back to the cached ones
    if theta is None:
        theta = self.theta
    if classes is None:
        classes = self.classes
    if classes_paired is None:
        classes_paired = self.classes_paired

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        X = self.check_input_X_(X)
        self.check_input_t_(theta)
        check_items_match(X.columns, theta, 'features in X', 'theta',
                          'numbers', mode='len')

    # prediction
    p_y = self.predict_(X, theta, classes_paired, return_proba)
    if return_proba == False:
        result = pd.Series(p_y, name='classify', index=X.index)
        # replace each class position by its label
        for i, label in enumerate(classes):
            result[result == i] = label
    else:
        result = pd.DataFrame(p_y, columns=classes, index=X.index)

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for predict: %f' % time_cost)
    return result
def fit(self, X, y, output=False, show_time=False, check_input=True):
    """Fit the logistic-regression model.

    Note
    ----
    Features are continuous numeric values; the target is a discrete label.

    Parameters
    ----------
    X: feature matrix, DataFrame
    y: observed label vector, Series
    output: whether to return the fitted parameters, bool, default False
    show_time: whether to print the elapsed time, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: the fitted parameter matrix (DataFrame, one column per binary
       classifier) when ``output`` is set, otherwise ``None``

    Raises
    ------
    ValueError: when ``y`` holds fewer than two classes, or when
        ``self.multi_class`` is neither 'ovr' nor 'ovo'.

    Side effects
    ------------
    Sets ``classes``, ``classes_paired``, ``theta``, ``theta_h``,
    ``cost_min``, ``cost_h``, ``cost`` and ``score`` on ``self``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        check_type('output', type(output), bool)
        X = self.check_input_X_(X)
        y = self.check_input_y_(y)
        check_index_match(X, y, 'X', 'y')

    # number of classes
    values = y.sort_values().drop_duplicates().tolist()
    features_n, classes_n = len(X.columns), len(values)
    if classes_n <= 1:
        raise ValueError('classes_n in y should >=2')
    if classes_n >= 0.5 * len(y):
        print('\nwarning: too many classes in y')
    self.classes = values

    # scaling hint: every column except the first should span roughly 1
    scaled_part = X.iloc[:, 1:]
    range_ = scaled_part.max() - scaled_part.min()
    if (range_.max() < 1.1) & (range_.min() > 0.9):
        if self.learning_rate < 0.1:
            print(
                '\nit is recommended to change learning_rate over 0.1 for scaled X'
            )
    else:
        print('\nit is recommended to scale X')

    # one 0/1 column per class, indexed (record, class) -> belongs to class
    Y = dp.dummy_var(y)

    # build the binary targets, one column per classifier to train
    if self.multi_class == 'ovr':
        targets, models_n, classes_paired = Y, classes_n, None
    elif self.multi_class == 'ovo':
        # positive/negative selection matrices, indexed (pair, class) -> used
        class_p, class_n = dp.combine_enum_paired(list(range(classes_n)))
        # labels per pair, indexed (record, pair):
        # 1 -> positive class, 0 -> negative class, 0.5 -> undecidable
        Y_ = (np.dot(Y, class_p.T) - np.dot(Y, class_n.T) + 1.0) / 2.0
        targets = pd.DataFrame(Y_, index=Y.index)
        models_n = len(targets.columns)
        classes_paired = class_p - class_n
    else:
        # previously surfaced later as a NameError on ``theta``
        raise ValueError("multi_class should be 'ovr' or 'ovo'")

    # train one binary classifier per target column
    theta = np.zeros((features_n, models_n))
    cost_min = np.zeros(models_n)
    cost = np.zeros(models_n)
    theta_h, cost_h = [], []
    for i in range(models_n):
        print('\nfitting classifier %d ---' % i)
        theta_, theta_h_, cost_min_, cost_h_ = self.fit_by_sgd_(
            X, targets.iloc[:, i])
        theta[:, i], cost_min[i], cost[i] = (
            theta_, cost_min_, cost_h_.iloc[-1])
        theta_h.append(theta_h_)
        cost_h.append(cost_h_)
    self.classes_paired = classes_paired

    theta = pd.DataFrame(theta)
    cost_min = pd.Series(cost_min)
    cost = pd.Series(cost)

    self.theta = theta
    self.theta_h = theta_h
    self.cost_min = cost_min
    self.cost_h = cost_h
    self.cost = cost

    # training-set score
    p_y = self.predict(X, check_input=False)
    self.score = self.assess(y, p_y, check_input=False)

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for training: %f' % time_cost)

    # return the fitted parameters on request
    if output == True:
        return theta
def selection(self, test_X, test_y, units=None, units_oob_score=None,
              use='oob', return_units=False, show_time=False):
    """Select a subset of the fitted ensemble units.

    Notes
    -----
    Plays a role similar to pruning a decision tree: candidate subsets are
    generated by some rule and the one performing best on the test set is
    kept, aiming at a simpler model that generalises better.

    Parameters
    ----------
    test_X: test feature columns, DataFrame
    test_y: test target column, Series
    units: ensemble units, list(DecisionTree); defaults to ``self.units``
    units_oob_score: oob score of each unit, list(float); defaults to
        ``self.units_oob_score``
    use: selection method, str, default 'oob';
        'rd' -> random selection, 'oob' -> oob-based selection
    return_units: return the selected units instead of storing them in
        ``self.units``, bool, default False
    show_time: whether to print the elapsed time, bool, default False

    Returns
    -------
    0: the selected units when ``return_units`` is set, otherwise ``None``
       (the selection replaces ``self.units``)

    Raises
    ------
    ValueError: when ``units`` is empty.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # identity test: ``== None`` would be evaluated element-wise by
    # array-like arguments
    if units is None:
        units = self.units
    if units_oob_score is None:
        units_oob_score = self.units_oob_score

    # input validation
    check_type('units', type(units), type([]))
    if len(units) == 0:
        # same guard and message as predict(); avoids a bare IndexError below
        raise ValueError('lack of units')
    check_type('element in units', type(units[0]), type(dt.DecisionTree()))
    check_type('units_oob_score', type(units_oob_score), type([]))
    check_type('element in units_oob_score', type(units_oob_score[0]),
               [type(0.0), np.float64])
    check_type('use', type(use), type(''))
    check_type('return_units', type(return_units), type(True))
    use_list = ['rd', 'oob']
    check_limit('use', use in use_list, str(use_list))
    # the validators return (data, continuity); only the data is needed
    test_X, _ = self.unit_test.check_input_X_(test_X, 'test_X')
    test_y, _ = self.unit_test.check_input_y_(test_y, 'test_y')
    check_index_match(test_X, test_y, 'test_X', 'test_y')

    # every feature used by any unit must be present in test_X
    features = []
    for unit in units:
        features += unit.tree.features
    features = list(set(features))
    check_items_match(test_X.columns, features, 'test_X', 'tree',
                      'features', mode='right')

    # selection
    if use == 'rd':
        subset = self.random_selection_(test_X, test_y, units)
    elif use == 'oob':
        subset = self.oob_selection_(test_X, test_y, units, units_oob_score)

    end = time.perf_counter()
    if show_time == True:
        print('\ntime used for selection:%f' % (end - start))

    if return_units == False:
        self.units = subset
    else:
        return subset
def __init__(self, fit_mode='sgd', multi_class='ovo', learning_rate=0.001,
             iter_max=1000, mini_batch=0, L2_n=0.0, early_stop=True):
    """Validate and store the logistic-regression configuration.

    Parameters
    ----------
    fit_mode: only 'sgd' is accepted (case-insensitive), str
    multi_class: 'ovr' or 'ovo', str
    learning_rate: float, > 0.0
    iter_max: int, > 0
    mini_batch: int, >= 0
    L2_n: float, >= 0.0
    early_stop: bool
    """
    # check_type(name, actual type, required type)
    expected_types = (
        ('fit_mode', fit_mode, str),
        ('multi_class', multi_class, str),
        ('learning_rate', learning_rate, float),
        ('iter_max', iter_max, int),
        ('mini_batch', mini_batch, int),
        ('L2_n', L2_n, float),
        ('early_stop', early_stop, bool),
    )
    for arg_name, value, required in expected_types:
        check_type(arg_name, type(value), required)

    # check_limit(name, condition, hint with the accepted values)
    fit_mode = fit_mode.lower()
    fit_modes, multi_class_modes = ['sgd'], ['ovr', 'ovo']
    check_limit('fit_mode', fit_mode in fit_modes, str(fit_modes))
    check_limit('multi_class', multi_class in multi_class_modes,
                str(multi_class_modes))
    check_limit('learning_rate', learning_rate > 0.0, 'value>0.0')
    check_limit('iter_max', iter_max > 0, 'value>0')
    check_limit('mini_batch', mini_batch >= 0, 'value>=0')
    check_limit('L2_n', L2_n >= 0.0, 'value>=0.0')

    self.fit_mode = fit_mode
    self.learning_rate = learning_rate
    self.iter_max = iter_max
    self.mini_batch = mini_batch
    self.L2_n = L2_n
    self.early_stop = early_stop
    self.multi_class = multi_class
def fit(self, X, y, show_time=False):
    """Fit adaptive boosting (decision trees) to the given data.

    Note
    ----
    Column continuity is detected automatically; unsupported types must be
    preprocessed.
    (int64, float64) -> continuous
    (bool, category, object) -> discrete
    All discrete data is forcibly converted to str labels.

    Description
    -----------
    For m = 1, 2, ..., M:
    (a) train weak learner Gm(x) on the training set weighted by Dm
    (b) compute the error rate of Gm(x) on the training set
    (c) compute the weight of Gm(x) in the strong learner
    (d) update the sample weights (normalised so that they sum to 1)

    Parameters
    ----------
    X: feature columns, DataFrame
    y: target column, Series
    show_time: whether to print timing/progress information, bool,
        default False

    Side effects
    ------------
    Sets ``continuity_X``/``mapping_X``/``continuity_y``/``mapping_y``,
    ``features``, ``classes`` (mode 'c' only), ``units``, ``units_weight``,
    ``units_error`` (DataFrame with columns 'err', 'wgt_err') and ``fit_h``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    check_type('show_time', type(show_time), type(True))

    # validate X, y (X0/y0 are the sources returned by return_source=True)
    X, self.continuity_X, self.mapping_X, X0 = \
        self.unit_test.check_input_X_(X, to_index=True, return_source=True)
    y, self.continuity_y, self.mapping_y, y0 = \
        self.unit_test.check_input_y_(y, to_index=True, return_source=True)
    # X and y must match
    check_index_match(X, y, 'X', 'y')
    feature_use_n = len(X.columns)

    # feature / class labels
    self.features = X.columns.tolist()
    if self.mode == 'c':
        self.classes = \
            y0.drop_duplicates().sort_values().astype('str').tolist()
        k = len(self.classes)
    elif self.mode == 'r':
        k = 0

    # start from uniform sample weights
    sample_weight = np.ones(len(X))
    sample_weight = pd.Series(sample_weight / len(sample_weight),
                              index=X.index)

    # iteratively train the weak learners
    self.units, self.units_weight, self.units_error, self.fit_h = \
        [], [], [], []
    for i in range(self.iter_max):
        if show_time == True:
            print('\nfitting with unit %d ---' % i)

        # build and fit the model
        unit = dt.DecisionTree(mode=self.mode, model_type=self.units_type,
                               depth_max=self.depth_max)
        unit.continuity_X, unit.mapping_X = \
            self.continuity_X, self.mapping_X
        unit.continuity_y, unit.mapping_y = \
            self.continuity_y, self.mapping_y
        unit.features_use_n = feature_use_n
        unit.fit(X, y, sample_weight, show_time=show_time, check_input=False)

        # weighted error of the current weak learner and its weight
        mode = unit.tree.mode
        p_y = unit.predict(X0, check_input=False)
        fit_h_ = pd.DataFrame()
        fit_h_['y'], fit_h_['p_y'], fit_h_['sp_wgt'] = y0, p_y, sample_weight
        self.fit_h.append(fit_h_)
        errors = self.errors_(y0, p_y, mode)
        wgt_err = self.wgt_err_(errors, sample_weight, mode)
        # error under the initial (uniform) sample weights
        error = self.wgt_err_(errors, self.fit_h[0]['sp_wgt'], mode)

        # zero error: no need to keep training
        # NOTE(review): the zero-error unit itself is not added to
        # self.units; if this happens on the first iteration the ensemble
        # ends up empty -- confirm this is intended.
        if wgt_err == 0.0:
            if show_time == True:
                print('\nwarning: early stopping')
            break
        unit_weight = self.unit_weight_(self.learning_rate, wgt_err, k, mode)

        # a positive weight means the learner beats random guessing
        if unit_weight > 0:
            self.units_weight.append(unit_weight)
            self.units_error.append([error, wgt_err])
            # add to the strong learner (drop the shared references first)
            unit.continuity_X, unit.mapping_X = None, None
            unit.continuity_y, unit.mapping_y = None, None
            self.units.append(unit)
            # update the sample weights (not needed after the last round)
            if i < self.iter_max - 1:
                sample_weight = self.sample_weight_(errors, unit_weight,
                                                    sample_weight, mode)
                sample_weight = pd.Series(sample_weight, index=X.index)
        else:
            if show_time == True:
                print('\nwarning: unit is worse than random, discard')

    self.units_error = pd.DataFrame(self.units_error,
                                    columns=['err', 'wgt_err'])

    end = time.perf_counter()
    if show_time == True:
        print('\ntotal time used for trainning: %f' % (end - start))
def fit(self, X, y, output=False, show_time=False, check_input=True):
    """Fit the linear-regression model.

    Note
    ----
    All input data must be numeric; preprocess any other type yourself.

    Parameters
    ----------
    X: feature matrix, DataFrame
    y: observed value vector, Series
    output: whether to return the fitted parameters, bool, default False
    show_time: whether to print the elapsed time, bool, default False
    check_input: whether to validate the inputs, bool, default True

    Returns
    -------
    0: the fitted parameter vector when ``output`` is set, otherwise ``None``

    Side effects
    ------------
    Sets ``theta``, ``cost``, ``cost_min`` and ``score`` on ``self``; the
    'sgd' mode additionally sets ``theta_h`` and ``cost_h``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # input validation
    check_type('check_input', type(check_input), bool)
    if check_input == True:
        check_type('output', type(output), bool)
        X = self.check_input_X_(X)
        self.check_input_y_(y)
        check_index_match(X, y, 'X', 'y')

    # scaling hint: every column except the first should span roughly 1
    scaled_part = X.iloc[:, 1:]
    range_ = scaled_part.max() - scaled_part.min()
    if (range_.max() < 1.1) & (range_.min() > 0.9):
        if (self.learning_rate < 0.1) & (self.fit_mode == 'sgd'):
            print(
                '\nit is recommended to change learning_rate over 0.1 for scaled X'
            )
    else:
        print('\nit is recommended to scale X')

    # choose the fitting method
    print('\nfitting ---')
    if self.fit_mode == 'ne':
        theta = self.fit_by_ne_(X, y)
        self.theta = theta
        p_y = self.predict(X, check_input=False)
        a_result = self.assess(y, p_y, detailed=True, check_input=False)
        self.cost = a_result.loc['cost']
        self.cost_min = self.cost
        self.score = a_result.loc['r_sqr']
    elif self.fit_mode == 'sgd':
        theta, theta_h, cost_min, cost_h = self.fit_by_sgd_(X, y)
        self.theta = theta
        self.theta_h = theta_h
        self.cost = cost_h.iloc[-1]
        self.cost_min = cost_min
        self.cost_h = cost_h
        # Scoring on the training set is best-effort. Catch Exception
        # rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            p_y = self.predict(X, check_input=False)
            self.score = self.assess(y, p_y, check_input=False)
        except Exception:
            print('\nwarning: fail to assess on train')

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for training: %f' % time_cost)

    # return the fitted parameters on request
    if output == True:
        return theta
def check_input_t_(self, theta):
    """Validate that ``theta`` is a ``float64`` pandas Series.

    Nothing is returned; failures are reported by ``check_type``.
    """
    check_type('theta', type(theta), pd.Series)
    allowed_dtypes = [np.float64]
    check_type('theta', theta.dtype, allowed_dtypes)
def check_input_y_(self, y, name='y'):
    """Validate that ``y`` is an ``int64`` or ``float64`` pandas Series.

    ``name`` is the label used for ``y`` in the validation message.
    Nothing is returned; failures are reported by ``check_type``.
    """
    check_type(name, type(y), pd.Series)
    allowed_dtypes = [np.int64, np.float64]
    check_type(name, y.dtype, allowed_dtypes)
def predict(self, X, units=None, mode=None, classes=None, units_result=False,
            return_proba=False, return_paths=False, show_time=False):
    """Predict with all ensemble units and accumulate their outputs.

    Parameters
    ----------
    X: all feature columns, DataFrame
    units: ensemble units; defaults to ``self.units``. A list of
        DecisionTree in mode 'r', a list of per-class lists of DecisionTree
        in mode 'c'
    mode: 'c' -> classification, 'r' -> regression; defaults to ``self.mode``
    classes: class label list, list(str); defaults to ``self.classes`` in
        mode 'c'
    units_result: whether to also return the per-unit results, bool,
        default False
    return_proba: mode 'c' only: return the accumulated per-class values
        instead of a single class, bool, default False (these values cannot
        be used directly for assessment)
    return_paths: whether to also return the decision paths, bool,
        default False
    show_time: whether to print timing/progress information, bool,
        default False

    Returns
    -------
    0: the prediction: Series (regression values, or chosen classes) or
       DataFrame (per-class values when ``return_proba`` is set)
    1: only when ``units_result`` is set: one entry per unit. In mode 'r'
       the unit's output; in mode 'c' the unit's chosen class
       (``return_proba`` False) or the list of its per-class outputs
       (``return_proba`` True)
    2: only when ``return_paths`` is set: the paths of every unit (in mode
       'c' one list of per-class paths per unit)

    Raises
    ------
    ValueError: when ``units`` is empty.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # parameter validation
    if units is None:
        units = self.units
    if mode is None:
        mode = self.mode
    check_type('mode', type(mode), type(''))
    mode_list = ['c', 'r']
    check_limit('mode', mode in mode_list, str(mode_list))
    if (classes is None) & (mode == 'c'):
        classes = self.classes
    check_type('units', type(units), type([]))
    if len(units) == 0:
        raise ValueError('lack of units')
    if mode == 'r':
        check_type('element in units', type(units[0]),
                   type(dt.DecisionTree()))
    elif mode == 'c':
        check_type('element in units', type(units[0][0]),
                   type(dt.DecisionTree()))
    check_type('return_proba', type(return_proba), type(True))
    check_type('return_paths', type(return_paths), type(True))
    check_type('show_time', type(show_time), type(True))
    X, continuity_X = self.unit_test.check_input_X_(X)

    # every feature used by any unit must be present in X
    features = []
    if mode == 'c':
        for units_ in units:
            for unit in units_:
                features += unit.tree.features
    elif mode == 'r':
        for unit in units:
            features += unit.tree.features
    features = list(set(features))
    check_items_match(X.columns, features, 'X', 'unit', 'features',
                      mode='right')

    # accumulator: per-class values for classification, values for regression
    n = len(X)
    if mode == 'c':
        p_y = pd.DataFrame(np.zeros((n, len(classes))),
                           index=X.index, columns=classes)
    elif mode == 'r':
        p_y = pd.Series(np.zeros(n), index=X.index)

    # per-unit class labels are only needed for this combination
    want_unit_labels = (
        (mode == 'c') and (units_result == True) and (return_proba == False))

    # call every unit and accumulate the results
    units_p_y, units_paths = [], []
    for i in range(len(units)):
        if show_time == True:
            print('\npredicting with unit %d ---' % i)

        if mode == 'r':
            if return_paths == True:
                p_y_, paths = units[i].predict(
                    X, return_proba=True, return_paths=True,
                    show_time=show_time, check_input=False)
                units_paths.append(paths)
            else:
                p_y_ = units[i].predict(
                    X, return_proba=True, return_paths=False,
                    show_time=show_time, check_input=False)
            p_y += p_y_
            if units_result == True:
                units_p_y.append(p_y_)

        # classification: one sub-unit per class predicts that class's value
        elif mode == 'c':
            classes_p_y, classes_paths = [], []
            if want_unit_labels:
                # this unit's own per-class values, laid out like p_y
                unit_scores = pd.DataFrame(np.zeros((n, len(classes))),
                                           index=X.index, columns=classes)
            for j in range(len(classes)):
                if return_paths == True:
                    p_y_, paths = units[i][j].predict(
                        X, return_proba=True, return_paths=True,
                        show_time=show_time, check_input=False)
                    classes_paths.append(paths)
                else:
                    p_y_ = units[i][j].predict(
                        X, return_proba=True, return_paths=False,
                        show_time=show_time, check_input=False)
                p_y.iloc[:, j] += p_y_
                if want_unit_labels:
                    unit_scores.iloc[:, j] += p_y_
                classes_p_y.append(p_y_)
            if return_paths == True:
                units_paths.append(classes_paths)
            if units_result == True:
                if want_unit_labels:
                    # units[i] is a list of sub-units and has no
                    # choose_class_; pick the class from this unit's
                    # per-class values with the shared helper instead.
                    units_p_y.append(
                        self.unit_test.choose_class_(unit_scores, classes))
                else:
                    units_p_y.append(classes_p_y)

    # return the accumulated values or the single chosen class
    if (mode == 'c') & (return_proba == False):
        p_y = self.unit_test.choose_class_(p_y, classes)

    end = time.perf_counter()
    if show_time == True:
        print('\ntotal time used for predict: %f' % (end - start))

    # return the accumulated paths of all units, not only the last ``paths``
    if units_result == True:
        if return_paths == True:
            return p_y, units_p_y, units_paths
        return p_y, units_p_y
    if return_paths == True:
        return p_y, units_paths
    return p_y
def fit(self, X, y, show_time=False):
    """Fit gradient boosting (decision trees) to the given data.

    Note
    ----
    Column continuity is detected automatically; unsupported types must be
    preprocessed.
    (int64, float64) -> continuous
    (bool, category, object) -> discrete
    All discrete data is forcibly converted to str labels.

    Description
    -----------
    For each iteration t = 1, 2, ..., T:
    (a) compute the negative gradient of the cost for every sample
    (b) fit a weak learner with the negative gradient as its target
    (c) fit a multiplier for the weak learner that minimises the current
        cost, then update the strong learner

    Parameters
    ----------
    X: feature columns, DataFrame
    y: target column, Series
    show_time: whether to print timing/progress information, bool,
        default False

    Side effects
    ------------
    Sets ``continuity_X``/``mapping_X``/``continuity_y``/``mapping_y``,
    ``features``, ``classes`` (mode 'c' only), ``units``, ``units_p_y`` and
    ``r_h``. In mode 'c' each entry of ``units``/``units_p_y`` is a list
    with one item per class.
    """

    def scale_leaves(unit, target, fitted):
        # One-dimensional line search: the multiplier gamma that minimises
        # the squared error between ``target`` and ``gamma * fitted``.
        # Scales the unit's leaf outputs by gamma and returns it.
        denominator = (fitted ** 2).sum()
        # numpy/pandas scalars yield nan/inf on division by zero instead of
        # raising ZeroDivisionError, so test the denominator explicitly and
        # leave the unit unscaled in that case.
        if denominator == 0:
            return 1
        gamma = (target * fitted).sum() / denominator
        for node in unit.tree.nodes:
            if node.is_leaf == True:
                node.output *= gamma
        return gamma

    def new_unit():
        # Build a weak learner that shares the fitted input metadata.
        unit = dt.DecisionTree(mode=self.units_mode,
                               model_type=self.units_type,
                               depth_max=self.depth_max)
        unit.continuity_X, unit.mapping_X = \
            self.continuity_X, self.mapping_X
        unit.continuity_y, unit.mapping_y = \
            self.continuity_y, self.mapping_y
        unit.features_use_n = feature_use_n
        return unit

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    check_type('show_time', type(show_time), type(True))

    # validate X, y (X0/y0 are the sources returned by return_source=True)
    X, self.continuity_X, self.mapping_X, X0 = \
        self.unit_test.check_input_X_(X, to_index=True, return_source=True)
    y, self.continuity_y, self.mapping_y, y0 = \
        self.unit_test.check_input_y_(y, to_index=True, return_source=True)
    # X and y must match
    check_index_match(X, y, 'X', 'y')
    feature_use_n = len(X.columns)

    # feature labels
    self.features = X.columns.tolist()

    # initialise the strong learner's prediction
    n = len(y)
    self.units_p_y = []
    self.r_h = []
    if self.mode == 'c':
        self.classes = \
            y0.drop_duplicates().sort_values().astype('str').tolist()
        y = dp.dummy_var(y)
        # per-class values, starting from softmax of all zeros
        p_y = pd.DataFrame(
            stats.softmax(np.zeros((n, len(self.classes)))),
            index=y.index, columns=y.columns)
    elif self.mode == 'r':
        # regression values
        p_y = pd.Series(np.zeros(n), index=X.index)

    # iteratively train the weak learners
    self.units = []
    for i in range(self.iter_max):
        if show_time == True:
            print('\nfitting with unit %d ---' % i)

        # the negative gradient of the current prediction is the next target
        r = self.learning_rate * self.gradient_(y, p_y, self.mode)
        self.r_h.append(r)

        # stop early (no threshold configured yet, hence 0)
        if (r ** 2).values.sum() <= 0:
            print('\nwarning: early stopping')
            break

        if self.mode == 'r':
            # build and fit the model
            unit = new_unit()
            unit.fit(X, r, show_time=show_time, check_input=False)
            # prediction of the weak learner
            p_y_ = unit.predict(X, return_proba=True)
            # Multiplier minimising the cost (1-D optimisation).
            # Note from the original author: fitting one multiplier per
            # leaf region would be more precise, and GBDT also works
            # without this multiplier.
            gamma = scale_leaves(unit, r, p_y_)
            p_y += gamma * p_y_
            # add to the strong learner (drop the shared references first)
            unit.continuity_X, unit.mapping_X = None, None
            unit.continuity_y, unit.mapping_y = None, None
            self.units.append(unit)
            self.units_p_y.append(p_y_)

        elif self.mode == 'c':
            sub_units, sub_units_p_y = [], []
            # fit one weak learner per class on that class's column of the
            # negative gradient
            for j in range(len(self.classes)):
                if show_time == True:
                    print('\n|| sub-unit for class %s'
                          % str(self.classes[j]))
                # build and fit the model
                unit = new_unit()
                r_ = r.iloc[:, j]
                unit.fit(X, r_, show_time=show_time, check_input=False)
                # prediction of the weak learner
                p_y_ = unit.predict(X, return_proba=True)
                sub_units_p_y.append(p_y_)
                # multiplier minimising the cost (1-D optimisation)
                gamma = scale_leaves(unit, r_, p_y_)
                p_y.iloc[:, j] += gamma * p_y_
                # add to the current layer (drop the shared references)
                unit.continuity_X, unit.mapping_X = None, None
                unit.continuity_y, unit.mapping_y = None, None
                sub_units.append(unit)
            # add the layer to the strong learner
            self.units.append(sub_units)
            self.units_p_y.append(sub_units_p_y)

    end = time.perf_counter()
    if show_time == True:
        print('\ntotal time used for trainning: %f' % (end - start))
def fit(self,X,y,show_time=False):
    '''
    Function: fit a random forest to the input data

    Note: per the original documentation, column continuity is detected
          automatically and unsupported dtypes must be preprocessed beforehand
          (int64,float64)->continuous
          (bool,category,object)->discrete
          all discrete data is force-converted to str labels

    Description:
          (a) draw N samples with replacement from the training set of size N
              (bootstrap sample) as the training set of each tree
          (b) at every split, pick a random subset of m features out of all
              M features (m<<M) and choose the best split among them
          (c) every tree is grown as far as possible, without pruning
          (d) the rows not drawn for a tree are used as its test set to
              estimate the generalisation error (out of bag error)

    Parameters
    ----------
    X: feature columns, DataFrame
    y: target column, Series
    show_time: whether to print progress/timing information, bool, default False
    ----------

    Side effects
    ------------
    Sets self.continuity_X/self.mapping_X/self.continuity_y/self.mapping_y,
    self.features_use_n, self.features, self.classes, self.units,
    self.units_oob_score and self.oob_score.
    ------------
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time
    start=time.perf_counter()
    check_type('show_time',type(show_time),type(True))
    # validate X and y
    X,self.continuity_X,self.mapping_X,X0=\
        self.unit_test.check_input_X_(X,to_index=True,return_source=True)
    y,self.continuity_y,self.mapping_y,y0=\
        self.unit_test.check_input_y_(y,to_index=True,return_source=True)
    # validate that X and y match
    check_index_match(X,y,'X','y')
    # upper bound on the number of features used per split
    self.features_use_n=self.unit_test.compute_features_use_n_(
        len(X.columns),self.features_use)
    # ensemble units and their individual oob scores
    self.units,self.units_oob_score=[],[]
    self.features=X.columns.tolist()
    # initialise the accumulator for out-of-bag predictions
    sample_n=len(X.index)
    if self.mode=='c':
        self.classes=y0.drop_duplicates().sort_values().astype('str').tolist()
        oob_predict=pd.DataFrame(np.zeros((sample_n,len(self.classes))),
                                 index=X.index,columns=self.classes)
    elif self.mode=='r':
        self.classes=[]
        oob_predict=pd.Series(np.zeros(sample_n),index=X.index)
    # number of trees for which each row was out of bag
    oob_trees_n=pd.Series(np.zeros(sample_n),index=X.index)
    # fit the units one by one (native multiprocessing/multithreading was
    # tried by the original author without a worthwhile speed-up)
    for i in range(self.units_n):
        if show_time:
            print('\nfitting with unit %d ---'%i)
        # bootstrap sample of unchanged size; the rows never drawn are the oob set
        X_=X.sample(frac=1.0,replace=True)
        y_=y[X_.index]
        iob_index=X_.index.drop_duplicates()
        oob_X0_=X0[~X0.index.isin(iob_index)]
        oob_y0_=y0[oob_X0_.index]
        # sampling with replacement produces duplicated labels, so re-index
        X_.index=range(len(X_))
        y_.index=range(len(y_))
        # build and fit the tree
        unit=dt.DecisionTree(mode=self.mode,model_type=self.units_type,
                             depth_max=self.depth_max,
                             split_sample_n=self.split_sample_n,
                             leaf_sample_n=self.leaf_sample_n,
                             features_use=self.features_use,
                             features_reuse=self.features_reuse)
        # share the already-validated metadata so the tree can skip input checks
        unit.continuity_X,unit.mapping_X=self.continuity_X,self.mapping_X
        unit.continuity_y,unit.mapping_y=self.continuity_y,self.mapping_y
        unit.features_use_n=self.features_use_n
        unit.fit(X_,y_,show_time=show_time,check_input=False)
        # out-of-bag prediction and score of this tree
        if self.mode=='c':
            p_y_=unit.predict(oob_X0_,return_proba=True,check_input=False)
            p_y_0=unit.choose_class_(p_y_,self.classes)
            score_=unit.assess(oob_y0_,p_y_0,check_input=False)
            oob_predict.loc[p_y_.index,:]+=p_y_
        elif self.mode=='r':
            p_y_=unit.predict(oob_X0_,check_input=False)
            score_=unit.assess(oob_y0_,p_y_,check_input=False)
            oob_predict.loc[p_y_.index]+=p_y_
        oob_trees_n.loc[p_y_.index]+=1
        self.units_oob_score.append(score_)
        # add to the forest (the mapping references are dropped first)
        unit.mapping_X,unit.mapping_y=None,None
        self.units.append(unit)
    # overall oob prediction
    # note: rows that were never out of bag have no prediction and are filtered out
    boolIdx=(oob_trees_n!=0.0)
    if self.mode=='c':
        oob_predict=self.unit_test.choose_class_(oob_predict[boolIdx],self.classes)
    elif self.mode=='r':
        oob_predict=oob_predict[boolIdx]/oob_trees_n[boolIdx]
    self.oob_score=self.unit_test.assess(y0[boolIdx],oob_predict,
                                         mode=self.mode,check_input=False)
    end=time.perf_counter()
    if show_time:
        print('\ntotal time used for trainning: %f'%(end-start))