예제 #1
0
 def assess(self, y, p_y, return_dist=False, check_input=True):
     '''\n
     Function: evaluate the model from observed and predicted labels

     Note: the accuracy and cost on the training set are stored on the
           instance after fitting; read them through .score and .cost

     Parameters
     ----------
     y: observed values, Series
     p_y: predicted values, Series
     return_dist: whether to also return the prediction distribution,
                  bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: accuracy, float
     1: prediction distribution, DataFrame
     -------
     '''
     # Validation only runs when the flag compares equal to True
     validate = (check_input == True)
     if validate:
         observed = self.check_input_y_(y)
         predicted = self.check_input_y_(p_y, 'p_y')
         check_index_match(observed, predicted, 'y', 'p_y')
     else:
         observed, predicted = y, p_y
     # The score (and the distribution when requested) is computed by stats
     return stats.accuracy(observed, predicted, return_dist, self.classes)
예제 #2
0
 def assess(self,y,p_y,mode=None):
     '''\n
     Function: evaluate the model from observed and predicted values

     Notes: mind the dtype of the data set; str is preferred for
            classification and float64 for regression. Fitting with a
            non-preferred dtype may cause a type mismatch here, so convert
            the data beforehand

     Parameters
     ----------
     y: observed values, Series
     p_y: predicted values, Series
     mode: mode, str, defaults to the attribute of the internal ensemble unit,
           'c'->classification, 'r'->regression
     ----------

     Returns
     -------
     0: classification->accuracy, regression->R squared, float
     -------
     '''
     # Default to the mode recorded on the first ensemble unit
     if mode is None:
         mode=self.units[0].tree.mode
     check_type('mode',type(mode),type(''))
     allowed_modes=['c','r']
     check_limit('mode',mode in allowed_modes,str(allowed_modes))
     # The validator returns a pair; only its first item is needed here
     y,_=self.unit_test.check_input_y_(y,name='y')
     p_y,_=self.unit_test.check_input_y_(p_y,name='p_y')
     check_index_match(y,p_y,'y','p_y')
     # Accuracy for classification, R squared for regression
     if mode=='c':
         return stats.accuracy(y,p_y)
     if mode=='r':
         return stats.r_sqr(y,p_y)
예제 #3
0
 def assess(self, y, p_y, return_dist=False, check_input=True):
     '''\n
     Function: evaluate the model from observed and predicted values

     Notes: mind the dtype of the data set; str is preferred for
            classification and float64 for regression. Fitting with a
            non-preferred dtype may cause a type mismatch here, so convert
            the data beforehand

     Parameters
     ----------
     y: observed values, Series
     p_y: predicted values, Series
     return_dist: whether to also return the prediction distribution,
                  bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: classification->accuracy, regression->R squared, float
     -------
     '''
     task = self.mode
     # The flag itself is always type-checked; the data only on request
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         y = self.check_input_y_(y, name='y', mode=task)
         p_y = self.check_input_y_(p_y, name='p_y', mode=task)
         check_index_match(y, p_y, 'y', 'p_y')
     # Accuracy for classification, R squared for regression
     if task == 'c':
         return stats.accuracy(y, p_y, return_dist, self.classes)
     if task == 'r':
         return stats.r_sqr(y, p_y)
예제 #4
0
 def assess(self, y, pred_y, return_dist=False, check_input=True):
     '''\n
     Function: evaluate the model from observed and predicted values

     Notes: mind the dtype of the data set; str is preferred for
            classification and float64 for regression. Fitting with a
            non-preferred dtype may cause a type mismatch here, so convert
            the data beforehand

     Parameters
     ----------
     y: observed values, ndarray
     pred_y: predicted values, ndarray
     return_dist: whether to also return the prediction distribution,
                  bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: classification->accuracy, regression->R squared, float
     -------
     '''
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         # return_dist is only type-checked when validation is enabled
         check_type('return_dist', type(return_dist), type(True))
         y = self.check_input_y_(y, 'y', transform=False)
         pred_y = self.check_input_y_(pred_y, 'pred_y', transform=False)
         # only the lengths are compared (only_len=True)
         check_index_match(y, pred_y, 'y', 'pred_y', only_len=True)
     is_classification = (self.mode == 'c')
     if not is_classification:
         return stats.r_sqr(y, pred_y)
     return stats.accuracy(y, pred_y, return_dist, self.classes)
예제 #5
0
 def access(self, y, p_y, return_dist=False):
     '''\n
     Function: score predictions against observations

     Parameters
     ----------
     y: observed values
     p_y: predicted values
     return_dist: flag forwarded to stats.accuracy, must be bool,
                  default False
     ----------

     Returns
     -------
     0: whatever stats.accuracy returns for the validated inputs
     -------
     '''
     check_type('return_dist', type(return_dist), type(True))
     observed = self.check_input_y_(y)
     predicted = self.check_input_y_(p_y, 'p_y')
     check_index_match(observed, predicted, 'y', 'p_y')
     # The class labels come from the 'value' column of self.y_d_p
     labels = self.y_d_p['value'].values
     return stats.accuracy(observed, predicted, return_dist, labels)
예제 #6
0
 def fit(self, X, y):
     '''\n
     Function: fit the model with the input data

     Note: the fitted results returned by self.c_prob_ are stored in
           self.c_p, self.X_d_p and self.y_d_p; the elapsed time is printed

     Parameters
     ----------
     X: feature data, validated by self.check_input_X_
     y: target data, validated by self.check_input_y_
     ----------
     '''
     # time.clock() was removed in Python 3.8; perf_counter() replaces it
     start = time.perf_counter()
     X = self.check_input_X_(X)
     y = self.check_input_y_(y)
     check_index_match(X, y, 'X', 'y')
     self.c_p, self.X_d_p, self.y_d_p = self.c_prob_(X, y)
     print('\ntime used for training: %f' % (time.perf_counter() - start))
예제 #7
0
 def selection(self,test_X,test_y,units=None,units_oob_score=None,
               use='oob',return_units=False,show_time=False):
     '''\n
     Function: select a subset of the ensemble units of a fitted model

     Notes: plays a role similar to pruning a decision tree: candidate
            subsets are generated by some rule and the one performing best
            on the test set is kept, giving a simpler model that
            generalizes better

     Parameters
     ----------
     test_X: test set features, DataFrame
     test_y: test set target, Series
     units: ensemble units, list(DecisionTree), defaults to self.units
     units_oob_score: oob score of each unit, list(float),
                      defaults to self.units_oob_score
     use: selection method, str, default 'oob'
          'rd'->random selection, 'oob'->oob selection
     return_units: whether to return the selected units instead of storing
                   them on the instance, bool, default False
     show_time: whether to print the time used, bool, default False
     ----------

     Returns
     -------
     0: the selected units when return_units is True; otherwise nothing is
        returned and self.units is replaced by the selection
     -------
     '''
     # time.clock() was removed in Python 3.8; perf_counter() replaces it
     start = time.perf_counter()
     # Identity test for the "not supplied" default
     if units is None:
         units=self.units
     if units_oob_score is None:
         units_oob_score=self.units_oob_score
     # Input validation
     check_type('units',type(units),type([]))
     check_type('element in units',type(units[0]),type(dt.DecisionTree()))
     check_type('units_oob_score',type(units_oob_score),type([]))
     check_type('element in units_oob_score',type(units_oob_score[0]),[type(0.0),np.float64])
     check_type('use',type(use),type(''))
     check_type('return_units',type(return_units),type(True))
     use_list=['rd','oob']
     check_limit('use',use in use_list,str(use_list))
     test_X,continuity_X=self.unit_test.check_input_X_(test_X,'test_X')
     test_y,continuity_y=self.unit_test.check_input_y_(test_y,'test_y')
     check_index_match(test_X,test_y,'test_X','test_y')
     # Collect the distinct features used by any unit and check them
     # against the columns of test_X
     features=[]
     for unit in units:
         features+=unit.tree.features
     features=list(set(features))
     check_items_match(test_X.columns,features,'test_X','tree','features',mode='right')
     # Selection
     if use=='rd':
         subset=self.random_selection_(test_X,test_y,units)
     elif use=='oob':
         subset=self.oob_selection_(test_X,test_y,units,units_oob_score)
     end = time.perf_counter()
     if show_time==True:
         print('\ntime used for selection:%f'%(end-start))
     # Either store the selection on the instance or hand it back
     if return_units==False:
         self.units=subset
     else:
         return subset
예제 #8
0
 def fit(self,
         X,
         y,
         test_X=None,
         test_y=None,
         show_time=False,
         monitor_cost=False,
         monitor_score=False,
         check_input=True):
     '''\n
     Function: fit the neural network with the input data

     Note: all input data must be continuous numeric values; preprocess
           other types yourself

     Parameters
     ----------
     X: feature matrix, ndarray(samples_n,input_shape)<float64,int64>
     y: target vector, ndarray(samples_n,)<str,float64,int64>
     test_X: test feature matrix, ndarray(samples_n,input_shape)<float64,int64>
     test_y: test target vector, ndarray(samples_n,)<str,float64,int64>
     show_time: whether to print the time used, bool, default False
     monitor_cost: monitor the change of cost, bool, default False
     monitor_score: monitor the change of score, bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------
     '''
     # time.clock() was removed in Python 3.8; perf_counter() replaces it
     start = time.perf_counter()
     # Input validation (timed separately into self.time_cost)
     start1 = time.perf_counter()
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         check_type('show_time', type(show_time), type(True))
         check_type('monitor_cost', type(monitor_cost), type(True))
         check_type('monitor_score', type(monitor_score), type(True))
         X = self.check_input_X_(X)
         y, self.classes = self.check_input_y_(y)
         if (len(self.classes) < 2) and (self.mode == 'c'):
             raise ValueError('too few classes,should >1')
         check_index_match(X, y, 'X', 'y', only_len=True)
         # The test set is optional and only validated when supplied
         if test_X is not None:
             test_X = self.check_input_X_(test_X, name='test_X')
             test_y, test_classes = self.check_input_y_(test_y,
                                                        name='test_y')
             check_index_match(test_X,
                               test_y,
                               'test_X',
                               'test_y',
                               only_len=True)
     self.time_cost['input check'] += time.perf_counter() - start1
     # Optimization
     self.optimize_(X, y, test_X, test_y, monitor_cost, monitor_score)
     if show_time == True:
         print('\ntime used for training: %f' % (time.perf_counter() - start))
     self.time_cost['Total'] += time.perf_counter() - start
예제 #9
0
 def assess(self, y, p_y, theta=None, detailed=False, check_input=True):
     '''\n
     Function: evaluate the model

     Note: r2 and cost on the training set are stored on the instance after
           fitting; read them through .score and .cost

     Parameters
     ----------
     y: observed values, Series
     p_y: predicted values, Series
     theta: parameter vector, defaults to the cached self.theta
     detailed: whether to return the detailed evaluation, bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: r2 (float) or the evaluation table (Series)
     -------
     '''
     # Use the caller's theta when given, the cached one otherwise
     if theta is None:
         theta = self.theta
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         check_type('detailed', type(detailed), type(True))
         # The validators are called for their checks only; their return
         # values are not rebound to y / p_y
         self.check_input_y_(y)
         self.check_input_y_(p_y, 'p_y')
         check_index_match(y, p_y, 'y', 'p_y')
     score = stats.r_sqr(y, p_y)
     # Short path: only r2 was asked for
     if detailed == False:
         return score
     params_n, samples_n = len(theta), len(y)
     cost = self.cost_(y, p_y)
     adjusted = stats.adj_r_sqr(score, samples_n, params_n)
     # Assemble the table and return its 'value' column indexed by name
     rows = [('r_sqr', score), ('adj_r_sqr', adjusted), ('cost', cost)]
     table = pd.DataFrame(rows, columns=['index', 'value'])
     return table.set_index('index').iloc[:, 0]
예제 #10
0
 def assess(self,y,p_y,mode=None):
     '''\n
     Function: evaluate the model from observed and predicted values

     Notes: mind the dtype of the data set; str is preferred for
            classification and float64 for regression. Fitting with a
            non-preferred dtype may cause a type mismatch here, so convert
            the data beforehand

     Parameters
     ----------
     y: observed values, Series
     p_y: predicted values, Series
     mode: mode, str, defaults to the attribute of the instance,
           'c'->classification, 'r'->regression
     ----------

     Returns
     -------
     0: classification->accuracy, regression->R squared, float
     -------
     '''
     # Default to the mode stored on the instance
     if mode is None:
         mode=self.mode
     check_type('mode',type(mode),type(''))
     allowed_modes=['c','r']
     check_limit('mode',mode in allowed_modes,str(allowed_modes))
     check_index_match(y,p_y,'y','p_y')
     if mode=='c':
         # Both sides are compared as str labels
         observed=y.astype('str')
         predicted=p_y.astype('str')
         return stats.accuracy(observed,predicted)
     if mode=='r':
         score=stats.r_sqr(y,p_y)
         # A negative R squared is reported but still returned
         if score<0:
             print('warning: R2 is less than 0, which means bad fitting,'
                   '\ntry to reduce the learning rate')
         return score
예제 #11
0
 def fit(self, X, y, output=False, show_time=False, check_input=True):
     '''\n
     Function: fit logistic regression with the input data

     Note: logistic regression takes continuous numeric features as input
           and produces discrete labels as output

     Parameters
     ----------
     X: feature matrix, DataFrame
     y: observed values, Series
     output: whether to return the fitted parameters, bool, default False
     show_time: whether to print the time used, bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: the fitted parameters (DataFrame, one column per classifier),
        only when output is True
     -------
     '''
     # time.clock() was removed in Python 3.8; perf_counter() replaces it
     start = time.perf_counter()
     # Input validation
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         check_type('output', type(output), type(True))
         X = self.check_input_X_(X)
         y = self.check_input_y_(y)
         check_index_match(X, y, 'X', 'y')
     # Count the classes
     values = y.sort_values().drop_duplicates().tolist()
     features_n, classes_n = len(X.columns), len(values)
     if classes_n <= 1:
         raise ValueError('classes_n in y should >=2')
     if classes_n >= 0.5 * len(y):
         print('\nwarning: too many classes in y')
     self.classes = values
     # Scaling check: the first column is excluded from the range test
     range_ = X.iloc[:, 1:].max() - X.iloc[:, 1:].min()
     if (range_.max() < 1.1) & (range_.min() > 0.9):
         if self.learning_rate < 0.1:
             print(
                 '\nit is recommended to change learning_rate over 0.1 for scaled X'
             )
     else:
         print('\nit is recommended to scale X')
     # Turn the single multi-class column into one 0/1 column per class,
     # indexed (record, class) -> belongs to that class
     Y = dp.dummy_var(y)
     theta_h, cost_h = [], []
     # Multi-class mode: one-vs-rest
     if self.multi_class == 'ovr':
         theta = np.zeros((features_n, classes_n))
         cost_min = np.zeros(classes_n)
         cost = np.zeros(classes_n)
         for i in range(classes_n):
             print('\nfitting classifier %d ---' % i)
             theta_, theta_h_, cost_min_, cost_h_ = self.fit_by_sgd_(
                 X, Y.iloc[:, i])
             theta[:, i], cost_min[i], cost[
                 i] = theta_, cost_min_, cost_h_.iloc[-1]
             theta_h.append(theta_h_)
             cost_h.append(cost_h_)
         self.classes_paired = None
     # Multi-class mode: one-vs-one
     elif self.multi_class == 'ovo':
         # Positive/negative sample selection matrices,
         # indexed (combination, class) -> used
         class_p, class_n = dp.combine_enum_paired(list(range(classes_n)))
         # Targets after applying the selection matrices,
         # indexed (record, combination) -> label:
         # 1 -> positive class, 0 -> negative class, 0.5 -> undecidable
         Y_ = (np.dot(Y, class_p.T) - np.dot(Y, class_n.T) + 1.0) / 2.0
         Y_ = pd.DataFrame(Y_, index=Y.index)
         combines_n = len(Y_.columns)
         theta = np.zeros((features_n, combines_n))
         cost_min = np.zeros(combines_n)
         cost = np.zeros(combines_n)
         for i in range(combines_n):
             print('\nfitting classifier %d ---' % i)
             theta_, theta_h_, cost_min_, cost_h_ = self.fit_by_sgd_(
                 X, Y_.iloc[:, i])
             theta[:, i], cost_min[i], cost[
                 i] = theta_, cost_min_, cost_h_.iloc[-1]
             theta_h.append(theta_h_)
             cost_h.append(cost_h_)
         self.classes_paired = class_p - class_n
     # Store the fitted parameters and the optimization history
     theta = pd.DataFrame(theta)
     cost_min = pd.Series(cost_min)
     cost = pd.Series(cost)
     self.theta = theta
     self.theta_h = theta_h
     self.cost_min = cost_min
     self.cost_h = cost_h
     self.cost = cost
     # Score on the training set
     p_y = self.predict(X, check_input=False)
     self.score = self.assess(y, p_y, check_input=False)
     time_cost = time.perf_counter() - start
     if show_time == True:
         print('\ntime used for training: %f' % time_cost)
     # Return the fitted parameters on request
     if output == True:
         return theta
예제 #12
0
 def fit(self, X, y, output=False, show_time=False, check_input=True):
     '''\n
     Function: fit linear regression with the input data

     Note: all input data for linear regression must be numeric; preprocess
           other types yourself

     Parameters
     ----------
     X: feature matrix, DataFrame
     y: observed values, Series
     output: whether to return the fitted parameter vector, bool,
             default False
     show_time: whether to print the time used, bool, default False
     check_input: whether to validate the inputs, bool, default True
     ----------

     Returns
     -------
     0: the fitted parameter vector, Series, only when output is True
     -------
     '''
     # time.clock() was removed in Python 3.8; perf_counter() replaces it
     start = time.perf_counter()
     # Input validation
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         check_type('output', type(output), type(True))
         X = self.check_input_X_(X)
         # called for its checks only; the return value is not rebound
         self.check_input_y_(y)
         check_index_match(X, y, 'X', 'y')
     # Scaling check: the first column is excluded from the range test
     range_ = X.iloc[:, 1:].max() - X.iloc[:, 1:].min()
     if (range_.max() < 1.1) & (range_.min() > 0.9):
         if (self.learning_rate < 0.1) & (self.fit_mode == 'sgd'):
             print(
                 '\nit is recommended to change learning_rate over 0.1 for scaled X'
             )
     else:
         print('\nit is recommended to scale X')
     # Choose the fitting method
     print('\nfitting ---')
     if self.fit_mode == 'ne':
         theta = self.fit_by_ne_(X, y)
         self.theta = theta
         p_y = self.predict(X, check_input=False)
         a_result = self.assess(y, p_y, detailed=True, check_input=False)
         self.cost = a_result.loc['cost']
         self.cost_min = self.cost
         self.score = a_result.loc['r_sqr']
     elif self.fit_mode == 'sgd':
         theta, theta_h, cost_min, cost_h = self.fit_by_sgd_(X, y)
         self.theta = theta
         self.theta_h = theta_h
         self.cost = cost_h.iloc[-1]
         self.cost_min = cost_min
         self.cost_h = cost_h
         # Scoring on the training set is best-effort; Exception (not a
         # bare except) keeps KeyboardInterrupt/SystemExit propagating
         try:
             p_y = self.predict(X, check_input=False)
             self.score = self.assess(y, p_y, check_input=False)
         except Exception:
             print('\nwarning: fail to assess on train')
     time_cost = time.perf_counter() - start
     if show_time == True:
         print('\ntime used for training: %f' % time_cost)
     # Return the fitted parameters on request
     if output == True:
         return theta
예제 #13
0
    def fit(self,X,y,show_time=False):
        '''\n
        Function: fit gradient boosting (decision trees) with the input data

        Note: the continuity of each data column is inferred automatically;
              unsupported dtypes must be preprocessed
              (int64,float64)->continuous
              (bool,category,object)->discrete
              all discrete data is forcibly converted to str labels

        Description: for iteration t=1,2,...T:
            (a) for samples i=1,2,...m compute the negative gradient of the
                cost function
            (b) fit a weak learner with the negative gradient as its target
            (c) fit a weight for the weak learner that minimizes the current
                cost, then update the strong learner

        Parameters
        ----------
        X: feature columns, DataFrame
        y: target column, Series
        show_time: whether to print the time used, bool, default False
        ----------
        '''
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        start = time.perf_counter()
        check_type('show_time',type(show_time),type(True))
        # Validate X and y
        X,self.continuity_X,self.mapping_X,X0=\
            self.unit_test.check_input_X_(X,to_index=True,return_source=True)
        y,self.continuity_y,self.mapping_y,y0=\
            self.unit_test.check_input_y_(y,to_index=True,return_source=True)
        # Check that X and y match
        check_index_match(X,y,'X','y')
        feature_use_n=len(X.columns)
        # Feature labels
        self.features=X.columns.tolist()
        # Initialize the prediction of the strong learner
        n=len(y)
        self.units_p_y=[]
        self.r_h=[]
        if self.mode=='c':
            self.classes=y0.drop_duplicates().sort_values().astype('str').tolist()
            y=dp.dummy_var(y)
            # DataFrame holding the classification result
            p_y=pd.DataFrame(
                    stats.softmax(np.zeros((n,len(self.classes)))),
                    index=y.index,columns=y.columns)
        elif self.mode=='r':
            # Series holding the regression values
            p_y=pd.Series(np.zeros(n),index=X.index)
        # Train the weak learners iteratively
        self.units=[]
        for i in range(self.iter_max):
            if show_time==True:
                print('\nfitting with unit %d ---'%i)
            # The scaled negative gradient at the current prediction is the
            # fitting target of this round
            r=self.learning_rate*self.gradient_(y,p_y,self.mode)
            self.r_h.append(r)
            # Early stopping (no threshold is configured yet, hence 0)
            if (r**2).values.sum()<=0:
                print('\nwarning: early stopping')
                break
            if self.mode=='r':
                # Build and fit the model
                unit=dt.DecisionTree(mode=self.units_mode,model_type=self.units_type,
                                     depth_max=self.depth_max)
                unit.continuity_X,unit.mapping_X=self.continuity_X,self.mapping_X
                unit.continuity_y,unit.mapping_y=self.continuity_y,self.mapping_y
                unit.features_use_n=feature_use_n
                unit.fit(X,r,show_time=show_time,check_input=False)
                # Prediction of the weak learner
                p_y_=unit.predict(X,return_proba=True)
                # Compute a multiplier for the weak learner that minimizes
                # the cost (a one-dimensional optimization problem)
                # Note: a multiplier per leaf region could also be fitted for
                #       higher precision; gbdt works even without it
                # NOTE(review): if p_y_ is a pandas/NumPy object a zero
                # denominator may yield nan/inf instead of raising
                # ZeroDivisionError - confirm this fallback is reachable
                try:
                    gamma=(r*p_y_).sum()/(p_y_**2).sum()
                    for node in unit.tree.nodes:
                        if node.is_leaf==True:
                            node.output*=gamma
                except ZeroDivisionError:
                    gamma=1
                p_y+=gamma*p_y_
                # Add to the strong learner
                unit.continuity_X,unit.mapping_X=None,None
                unit.continuity_y,unit.mapping_y=None,None
                self.units.append(unit)
                self.units_p_y.append(p_y_)
            elif self.mode=='c':
                sub_units,sub_units_p_y=[],[]
                # For each class, fit a weak learner to the change of the
                # predicted probability along the negative gradient
                for j in range(len(self.classes)):
                    if show_time==True:
                        print('\n|| sub-unit for class %s'%str(self.classes[j]))
                    # Build and fit the model
                    unit=dt.DecisionTree(mode=self.units_mode,model_type=self.units_type,
                                         depth_max=self.depth_max)
                    unit.continuity_X,unit.mapping_X=self.continuity_X,self.mapping_X
                    unit.continuity_y,unit.mapping_y=self.continuity_y,self.mapping_y
                    unit.features_use_n=feature_use_n
                    r_=r.iloc[:,j]
                    unit.fit(X,r_,show_time=show_time,check_input=False)
                    # Prediction of the weak learner
                    p_y_=unit.predict(X,return_proba=True)
                    sub_units_p_y.append(p_y_)
                    # Compute a weight for the weak learner that minimizes
                    # the cost (a one-dimensional optimization problem)
                    try:
                        gamma=(r_*p_y_).sum()/(p_y_**2).sum()
                        for node in unit.tree.nodes:
                            if node.is_leaf==True:
                                node.output*=gamma
                    except ZeroDivisionError:
                        gamma=1
                    p_y.iloc[:,j]+=gamma*p_y_
                    # Add to the current layer of the strong learner
                    unit.continuity_X,unit.mapping_X=None,None
                    unit.continuity_y,unit.mapping_y=None,None
                    sub_units.append(unit)
                # Add the layer to the strong learner
                self.units.append(sub_units)
                self.units_p_y.append(sub_units_p_y)
        end = time.perf_counter()
        if show_time==True:
            print('\ntotal time used for trainning: %f'%(end-start))  
예제 #14
0
    def fit(self,X,y,show_time=False):
        '''\n
        Function: fit the random forest with the input data

        Note: the continuity of each data column is inferred automatically;
              unsupported dtypes must be preprocessed
              (int64,float64)->continuous
              (bool,category,object)->discrete
              all discrete data is forcibly converted to str labels

        Description:
            (a) draw N samples at random with replacement from the training
                set of size N (bootstrap sample) as each tree's training set
            (b) at each node split, randomly pick a subset of m features out
                of the M features (m<<M) and choose the best split among them
            (c) each tree grows as far as possible, with no pruning
            (d) data not drawn into the current tree's training set is used
                as a test set to compute the generalization error
                (out of bag error)

        Parameters
        ----------
        X: feature columns, DataFrame
        y: target column, Series
        show_time: whether to print the time used, bool, default False
        ----------
        '''
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        start = time.perf_counter()
        check_type('show_time',type(show_time),type(True))
        # Validate X and y
        X,self.continuity_X,self.mapping_X,X0=\
            self.unit_test.check_input_X_(X,to_index=True,return_source=True)
        y,self.continuity_y,self.mapping_y,y0=\
            self.unit_test.check_input_y_(y,to_index=True,return_source=True)
        # Check that X and y match
        check_index_match(X,y,'X','y')
        # Upper limit of the number of features used at each split
        self.features_use_n=self.unit_test.compute_features_use_n_(len(X.columns),self.features_use)
        # Ensemble units and the oob score of each unit
        self.units,self.units_oob_score=[],[]
        self.features=X.columns.tolist()
        # Initialize the out-of-bag prediction accumulator
        if self.mode=='c':
            self.classes=y0.drop_duplicates().sort_values().astype('str').tolist()
            oob_predict=pd.DataFrame(np.zeros((len(X.index),len(self.classes))),
                                     index=X.index,columns=self.classes)
        elif self.mode=='r':
            self.classes=[]
            oob_predict=pd.Series(np.zeros(len(X.index)),index=X.index)
        # Number of trees for which each record was out of bag
        oob_trees_n=pd.Series(np.zeros(len(X.index)),index=X.index)
        # Fit the units one by one (native multiprocessing and
        # multithreading were tried, without much benefit)
        for i in range(self.units_n):
            if show_time==True:
                print('\nfitting with unit %d ---'%i)
            # Bootstrap a training set of the same size and extract the
            # oob samples
            # Note: the index is regenerated because sampling with
            #       replacement produces duplicate index values
            X_=X.sample(frac=1.0,replace=True)
            y_=y[X_.index]
            iob_index=X_.index.drop_duplicates()
            oob_X0_=X0[~X0.index.isin(iob_index)]
            oob_y0_=y0[oob_X0_.index]
            X_.index=range(len(X_))
            y_.index=range(len(y_))
            # Build and fit the model
            unit=dt.DecisionTree(mode=self.mode,model_type=self.units_type,depth_max=self.depth_max,
                                 split_sample_n=self.split_sample_n,leaf_sample_n=self.leaf_sample_n,
                                 features_use=self.features_use,features_reuse=self.features_reuse)
            unit.continuity_X,unit.mapping_X=self.continuity_X,self.mapping_X
            unit.continuity_y,unit.mapping_y=self.continuity_y,self.mapping_y
            unit.features_use_n=self.features_use_n
            unit.fit(X_,y_,show_time=show_time,check_input=False)
            # oob prediction of this unit
            if self.mode=='c':
                p_y_=unit.predict(oob_X0_,return_proba=True,check_input=False)
                p_y_0=unit.choose_class_(p_y_,self.classes)
                score_=unit.assess(oob_y0_,p_y_0,check_input=False)
                oob_predict.loc[p_y_.index,:]+=p_y_
            elif self.mode=='r':
                p_y_=unit.predict(oob_X0_,check_input=False)
                score_=unit.assess(oob_y0_,p_y_,check_input=False)
                oob_predict.loc[p_y_.index]+=p_y_
            oob_trees_n.loc[p_y_.index]+=1
            self.units_oob_score.append(score_)
            # Add to the random forest
            unit.mapping_X,unit.mapping_y=None,None
            self.units.append(unit)
        # Overall oob prediction
        # Note: a few records are never out of bag and therefore have no
        #       prediction; they are filtered out
        boolIdx=(oob_trees_n!=0.0)
        if self.mode=='c':
            oob_predict=self.unit_test.choose_class_(oob_predict[boolIdx],self.classes)
        elif self.mode=='r':
            oob_predict=oob_predict[boolIdx]/oob_trees_n[boolIdx]
        score=self.unit_test.assess(y0[boolIdx],oob_predict,mode=self.mode,check_input=False)
        self.oob_score=score
        end = time.perf_counter()
        if show_time==True:
            print('\ntotal time used for trainning: %f'%(end-start))
예제 #15
0
    def fit(self,X,y,show_time=False):
        '''\n
        Function: fit adaptive boosting (decision trees) with the input data

        Note: the continuity of each data column is inferred automatically;
              unsupported dtypes must be preprocessed
              (int64,float64)->continuous
              (bool,category,object)->discrete
              all discrete data is forcibly converted to str labels

        Description: for m=1,2,...,M
            (a) learn from the training set with weight distribution Dm to
                obtain the weak learner Gm(x)
            (b) compute the error rate of Gm(x) on the training set
            (c) compute the weight of Gm(x) in the strong learner
            (d) update the weight distribution of the training set
                (normalized so that the sample probabilities sum to 1)

        Parameters
        ----------
        X: feature columns, DataFrame
        y: target column, Series
        show_time: whether to print the time used, bool, default False
        ----------
        '''
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        start = time.perf_counter()
        check_type('show_time',type(show_time),type(True))
        # Validate X and y
        X,self.continuity_X,self.mapping_X,X0=\
            self.unit_test.check_input_X_(X,to_index=True,return_source=True)
        y,self.continuity_y,self.mapping_y,y0=\
            self.unit_test.check_input_y_(y,to_index=True,return_source=True)
        # Check that X and y match
        check_index_match(X,y,'X','y')
        feature_use_n=len(X.columns)
        # Feature / class labels
        self.features=X.columns.tolist()
        if self.mode=='c':
            self.classes=y0.drop_duplicates().sort_values().astype('str').tolist()
            k=len(self.classes)
        elif self.mode=='r':
            k=0
        # Train the weak learners iteratively, starting from uniform
        # sample weights
        sample_weight=np.ones(len(X))
        sample_weight=pd.Series(sample_weight/len(sample_weight),index=X.index)
        self.units,self.units_weight,self.units_error,self.fit_h=[],[],[],[]
        for i in range(self.iter_max):
            if show_time==True:
                print('\nfitting with unit %d ---'%i)
            # Build and fit the model
            unit=dt.DecisionTree(mode=self.mode,model_type=self.units_type,
                                 depth_max=self.depth_max)
            unit.continuity_X,unit.mapping_X=self.continuity_X,self.mapping_X
            unit.continuity_y,unit.mapping_y=self.continuity_y,self.mapping_y
            unit.features_use_n=feature_use_n
            unit.fit(X,y,sample_weight,show_time=show_time,check_input=False)
            # Weighted error of the current weak learner and its weight
            mode=unit.tree.mode
            p_y=unit.predict(X0,check_input=False)
            fit_h_=pd.DataFrame()
            fit_h_['y'],fit_h_['p_y'],fit_h_['sp_wgt']=y0,p_y,sample_weight
            self.fit_h.append(fit_h_)
            errors=self.errors_(y0,p_y,mode)
            wgt_err=self.wgt_err_(errors,sample_weight,mode)
            # Error under the sample weights of the first round
            error=self.wgt_err_(errors,self.fit_h[0]['sp_wgt'],mode)
            # The error reached 0, no further training is needed
            if wgt_err==0.0:
                if show_time==True:
                    print('\nwarning: early stopping')
                break
            unit_weight=self.unit_weight_(self.learning_rate,wgt_err,k,mode)
            # A weight above 0 means the weak learner beats random guessing
            if unit_weight>0:
                self.units_weight.append(unit_weight)
                self.units_error.append([error,wgt_err])
                # Add to the strong learner
                unit.continuity_X,unit.mapping_X=None,None
                unit.continuity_y,unit.mapping_y=None,None
                self.units.append(unit)
                # Update the sample weights (skipped after the last round)
                if i<self.iter_max-1:
                    sample_weight=self.sample_weight_(errors,unit_weight,sample_weight,mode)
                    sample_weight=pd.Series(sample_weight,index=X.index)
            else:
                if show_time==True:
                    print('\nwarning: unit is worse than random, discard')
        self.units_error=pd.DataFrame(self.units_error,columns=['err','wgt_err'])
        end = time.perf_counter()
        if show_time==True:
            print('\ntotal time used for trainning: %f'%(end-start))
예제 #16
0
    def fit(self, X, y, keep_nonsv=False, show_time=False, check_input=True):
        '''\n
        Function: 使用输入数据拟合支持向量机
        
        Note: 输入数据必须全部是数值类型,其他类型自行预处理
        
        Parameters
        ----------
        X: 特征矩阵,DataFrame类型
        y: 目标向量,Series类型
        keep_nonsv: 是否保留非支持向量点,bool类型,默认False
        show_time: 是否显示时间开销,bool类型,默认False
        check_input: 是否进行输入校验,bool类型,默认值True
        ----------
        '''
        start = time.clock()
        #输入校验
        check_type('check_input', type(check_input), type(True))
        if check_input == True:
            check_type('show_time', type(show_time), type(True))
            X = self.check_input_X_(X)
            y = self.check_input_y_(y, mode=self.mode)
            check_index_match(X, y, 'X', 'y')
        #分类
        n, m = len(y), len(X.columns)
        if self.mode == 'c':
            #根据类别数量处理y
            values = y.drop_duplicates().sort_values().tolist()
            classes_n = len(values)
            if classes_n >= 0.5 * len(y):
                print('\nwarning: too many classes in y')
            if classes_n <= 1:
                raise ValueError('classes_n in y should >=2')
            self.classes = values
            #smo优化
            #二分类
            if classes_n == 2:
                y_ = np.ones(n)
                y_[y == values[0]] = -1
                p = np.ones(n)
                w,b,a,sv_X,sv_y,cost_h,optimize_h=\
                    self.smo_optimize_(X.values,y_,iter_max=self.iter_max,C=self.C,p=p,mode=self.mode,
                                       k_type=self.k_type,k_args=self.k_args,relax=self.relax)
                if keep_nonsv == False:
                    sv_idx = (a != 0)
                    self.a, self.sv_X, self.sv_y = [a[sv_idx]
                                                    ], [sv_X[sv_idx]
                                                        ], [sv_y[sv_idx]]
                else:
                    self.a, self.sv_X, self.sv_y = [a], [sv_X], [sv_y]
                self.w, self.b = [w], [b]
                self.cost_h, self.optimize_h = [cost_h], [optimize_h]
            #多分类
            else:
                Y = dp.dummy_var(y)
                self.w,self.b,self.a,self.sv_X,self.sv_y=[],[],[],[],[]
                self.cost_h, self.optimize_h = [], []
                p = np.ones(n)
                if self.multi_class == 'ovr':
                    for i in range(classes_n):
                        print('\nfitting classifier %d ---' % i)
                        y_ = Y.iloc[:, i].values
                        y_[y_ == 0] = -1
                        w,b,a,sv_X,sv_y,cost_h,optimize_h=\
                            self.smo_optimize_(X.values,y_,iter_max=self.iter_max,C=self.C,p=p,mode=self.mode,
                                               k_type=self.k_type,k_args=self.k_args,relax=self.relax)
                        if keep_nonsv == False:
                            sv_idx = (a != 0)
                            self.a.append(a[sv_idx]), self.sv_X.append(
                                sv_X[sv_idx]), self.sv_y.append(sv_y[sv_idx])
                        else:
                            self.a.append(a), self.sv_X.append(
                                sv_X), self.sv_y.append(sv_y)
                        self.w.append(w), self.b.append(b),
                        self.cost_h.append(cost_h), self.optimize_h.append(
                            optimize_h)
                elif self.multi_class == 'tree':
                    #生成所有节点
                    def split_classes(svm_id, classes):
                        """Recursively lay out a binary tree of classifiers over `classes`.

                        Each classifier node separates the first half of its
                        classes (label 1) from the second half (label -1).
                        A half that still contains two or more classes is
                        handed to a child classifier whose temporary id is
                        ``10 * svm_id + 1`` (or ``+ 2`` for the second child).

                        Parameters
                        ----------
                        svm_id: temporary id of the current classifier; the
                                root call must use 1
                        classes: sliceable sequence of class labels handled
                                 by this classifier
                        ----------

                        Returns
                        -------
                        For svm_id == 1: a DataFrame with columns
                        ['svm', 'classes', 'y', 'next'], where the temporary
                        ids in 'svm' and 'next' have been replaced by
                        consecutive indices 0, 1, 2, ... (ascending order of
                        the temporary ids). 'next' is -1 for a side that has
                        no child classifier.
                        For any other svm_id: the plain list of rows
                        [svm_id, classes, y, next] for this subtree.
                        -------
                        """
                        total = len(classes)
                        half = total // 2
                        first_half, second_half = classes[:half], classes[half:]
                        child_a = 10 * svm_id + 1
                        child_b = 10 * svm_id + 2

                        # Decide which sides continue into a child classifier;
                        # -1 means that side is not split any further.
                        if total == 2:
                            next_first, next_second = -1, -1
                        elif total == 3:
                            next_first, next_second = -1, child_a
                        else:
                            next_first, next_second = child_a, child_b

                        rows = [
                            [svm_id, first_half, 1, next_first],
                            [svm_id, second_half, -1, next_second],
                        ]

                        # Append the rows of the subtrees, if any.
                        if total == 3:
                            rows.extend(split_classes(child_a, second_half))
                        elif total > 3:
                            rows.extend(split_classes(child_a, first_half))
                            rows.extend(split_classes(child_b, second_half))

                        # Inner calls hand their raw rows back to the caller.
                        if svm_id != 1:
                            return rows

                        # Root call: build the table and compress the sparse
                        # temporary ids into consecutive indices. Processing
                        # the ids in ascending order keeps a freshly written
                        # index from matching an id that is still pending.
                        table = pd.DataFrame(
                            rows, columns=['svm', 'classes', 'y', 'next'])
                        ordered_ids = table['svm'].drop_duplicates()
                        ordered_ids = ordered_ids.sort_values().tolist()
                        for new_id, old_id in enumerate(ordered_ids):
                            table.loc[table['svm'] == old_id, 'svm'] = new_id
                            table.loc[table['next'] == old_id, 'next'] = new_id
                        return table

                    tree = split_classes(1, self.classes)
                    #每个节点训练一个分类器
                    for i in range(tree['svm'].max() + 1):
                        print('\nfitting classifier %d ---' % i)
                        left_classes = tree.loc[(tree['svm'] == i) &
                                                (tree['y'] == 1),
                                                'classes'].values[0]
                        right_classes = tree.loc[(tree['svm'] == i) &
                                                 (tree['y'] == -1),
                                                 'classes'].values[0]
                        classes_ = left_classes + right_classes
                        in_classes = (y.isin(classes_))
                        X_ = X[in_classes].values
                        y_ = y[in_classes].values
                        y__ = np.ones(len(y_))
                        y__[np.isin(y_, right_classes)] = -1
                        p_ = np.ones(len(y_))
                        w,b,a,sv_X,sv_y,cost_h,optimize_h=\
                            self.smo_optimize_(X_,y__,iter_max=self.iter_max,C=self.C,p=p_,mode=self.mode,
                                               k_type=self.k_type,k_args=self.k_args,relax=self.relax)
                        if keep_nonsv == False:
                            sv_idx = (a != 0)
                            self.a.append(a[sv_idx]), self.sv_X.append(
                                sv_X[sv_idx]), self.sv_y.append(sv_y[sv_idx])
                        else:
                            self.a.append(a), self.sv_X.append(
                                sv_X), self.sv_y.append(sv_y)
                        self.w.append(w), self.b.append(b)
                        self.cost_h.append(cost_h), self.optimize_h.append(
                            optimize_h)
                    self.tree = tree
                else:
                    raise ValueError('unsupported multi_class')
        #回归
        #参考了libsvm的实现,将原本要优化的两组a合并到一组进行优化
        elif self.mode == 'r':
            X_, y_, p = np.zeros((2 * n, m)), np.zeros(2 * n), np.zeros(2 * n)
            X_[:n, :], X_[n:, :] = X.values, X.values
            y_[:n], y_[n:] = 1, -1
            p[:n], p[n:] = -self.eps + y.values, -self.eps - y.values
            w,b,a_,sv_X_,sv_y_,cost_h,optimize_h=\
                self.smo_optimize_(X_,y_,iter_max=self.iter_max,C=self.C,p=p,mode=self.mode,
                                   k_type=self.k_type,k_args=self.k_args,relax=self.relax,y0=y.values)
            a = a_[:n] - a_[n:]
            sv_X, sv_y = X.values, np.ones(n)
            if keep_nonsv == False:
                sv_idx = (a != 0)
                self.a, self.sv_X, self.sv_y = [a[sv_idx]], [sv_X[sv_idx]
                                                             ], [sv_y[sv_idx]]
            else:
                self.a, self.sv_X, self.sv_y = [a], [sv_X], [sv_y]
            self.w, self.b = [w], [b]
            self.cost_h, self.optimize_h = [cost_h], [optimize_h]
        else:
            raise ValueError('unsupported mode')
        time_cost = time.clock() - start
        if show_time == True:
            print('\ntime used for training: %f' % time_cost)