def predict(self, X):
     start = time.clock()
     X = self.check_input_X_(X)
     features = self.X_d_p['feature'].drop_duplicates().tolist()
     check_items_match(X.columns,
                       features,
                       'X',
                       'P(X)',
                       'features',
                       mode='right')
     p_y_X = self.b_prob_(X, self.c_p, self.X_d_p, self.y_d_p)
     p_y_X_max = (p_y_X.T == p_y_X.max(axis=1)).T
     need_repair = np.where(p_y_X_max.sum(axis=1) > 1)[0]
     for i in need_repair:
         p_y_x_max = p_y_X_max[i, :]
         idx1 = np.where(p_y_x_max == 1)[0]
         keep = idx1[int(random.uniform(0, len(idx1)))]
         p_y_x_max[:] = 0
         p_y_x_max[keep] = 1
         p_y_X_max[i, :] = p_y_x_max
     classes = self.y_d_p['value'].values
     classes_idx = np.array(range(len(classes)))
     pred_y = np.dot(p_y_X_max, classes_idx).astype('int')
     pred_y = pd.Series(pred_y, name='classify', index=X.index)
     for i in range(len(classes)):
         pred_y[pred_y == i] = classes[i]
     print('\ntime used for predict: %f' % (time.clock() - start))
     return pred_y
 def selection(self,test_X,test_y,units=None,units_oob_score=None,
               use='oob',return_units=False,show_time=False):
     '''\n
     Function: 在生成好的模型上进行选择,筛选出集成单元的一个子集
     
     Notes: 作用类似于决策树的剪枝,通过一些规则生成可选子集,
            再通过在测试集上的表现选择最优的一个,能够得到更简单且泛化能力更强的模型
     
     Parameters
     ----------
     test_X: 测试集特征列,DataFrame类型
     test_y: 测试集目标列,Series类型
     units: 集成单元,list(DecitionTree)类型
     units_oob_score: 集成单元obb评分,list(float)类型
     use: 使用的选择方法,str类型,默认'oob'
          'rd'->随机选择,'oob'->oob选择
     return_units: 是否以返回值形式给到选择后的集成单元,bool类型,默认False
     show_time: 是否显示耗时,bool类型,默认值False
     ----------
     
     Returns
     -------
     0: 分类->准确率,回归->R方,float类型
     -------
     '''
     start = time.clock()
     if units==None:
         units=self.units
     if units_oob_score==None:
         units_oob_score=self.units_oob_score
     #输入校验
     check_type('units',type(units),type([]))
     check_type('element in units',type(units[0]),type(dt.DecisionTree()))
     check_type('units_oob_score',type(units_oob_score),type([]))
     check_type('element in units_oob_score',type(units_oob_score[0]),[type(0.0),np.float64])
     check_type('use',type(use),type(''))
     check_type('return_units',type(return_units),type(True))
     use_list=['rd','oob']
     check_limit('use',use in use_list,str(use_list))
     test_X,continuity_X=self.unit_test.check_input_X_(test_X,'test_X')
     test_y,continuity_y=self.unit_test.check_input_y_(test_y,'test_y')
     check_index_match(test_X,test_y,'test_X','test_y')
     features=[]
     for unit in units:
         features+=unit.tree.features
     features=list(set(features))
     check_items_match(test_X.columns,features,'test_X','tree','features',mode='right')
     #选择
     if use=='rd':
         subset=self.random_selection_(test_X,test_y,units)
     elif use=='oob':
         subset=self.oob_selection_(test_X,test_y,units,units_oob_score)
     end = time.clock()
     if show_time==True:
         print('\ntime used for selection:%f'%(end-start))
     if return_units==False:
         self.units=subset
     else:
         return subset
示例#3
0
 def predict(self, X, theta=None, show_time=False, check_input=True):
     '''\n
     Function: 对输入数据进行预测
     
     Note: theta参数不提供时直接使用内部存储
     
     Parameters
     ----------
     X: 特征矩阵,DataFrame类型
     theta: 参数向量,Series类型
     show_time: 是否显示时间开销,bool类型,默认False
     check_input: 是否进行输入校验,bool类型,默认值True
     ----------
     
     Returns
     -------
     0: 预测值向量,Series类型
     -------
     '''
     start = time.clock()
     #外部传入theta或使用内部缓存
     if type(theta) == type(None):
         theta = self.theta
     #输入校验
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         X = self.check_input_X_(X)
         self.check_input_t_(theta)
         check_items_match(X.columns,
                           theta,
                           'features in X',
                           'theta',
                           'numbers',
                           mode='len')
     #预测
     p_y = pd.Series(self.linear_(X, theta), index=X.index)
     time_cost = time.clock() - start
     if show_time == True:
         print('\ntime used for predict: %f' % time_cost)
     return p_y
示例#4
0
 def predict(self,
             X,
             theta=None,
             classes=None,
             classes_paired=None,
             return_proba=False,
             show_time=False,
             check_input=True):
     '''\n
     Function: 对输入数据进行预测
     
     Note: theta,classes,classes_paired参数不提供时直接使用内部存储
     
     Parameters
     ----------
     X: 特征矩阵,DataFrame类型
     theta: 参数向量,Series类型
     classes: 类标签,list(str)类型
     classes_paired: 正负样本类选取,narray(m,n)类型
     return_proba: 是否返回分类概率,bool类型,默认False
     show_time: 是否显示时间开销,bool类型,默认False
     check_input: 是否进行输入校验,bool类型,默认值True
     ----------
     
     Returns
     -------
     0: 预测值向量,Series类型
     -------
     '''
     start = time.clock()
     #外部传入参数或使用内部缓存
     if type(theta) == type(None):
         theta = self.theta
     if type(classes) == type(None):
         classes = self.classes
     if type(classes_paired) == type(None):
         classes_paired = self.classes_paired
     #输入校验
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         X = self.check_input_X_(X)
         self.check_input_t_(theta)
         check_items_match(X.columns,
                           theta,
                           'features in X',
                           'theta',
                           'numbers',
                           mode='len')
     #预测
     p_y = self.predict_(X, theta, classes_paired, return_proba)
     if return_proba == False:
         p_y = pd.Series(p_y, name='classify', index=X.index)
         for i in range(len(classes)):
             p_y[p_y == i] = classes[i]
         time_cost = time.clock() - start
         if show_time == True:
             print('\ntime used for predict: %f' % time_cost)
         return p_y
     else:
         time_cost = time.clock() - start
         if show_time == True:
             print('\ntime used for predict: %f' % time_cost)
         return pd.DataFrame(p_y, columns=classes, index=X.index)
 def predict(self,X,units=None,mode=None,classes=None,units_result=False,
             return_proba=False,return_paths=False,show_time=False):
     '''\n
     Function: 使用输入数据和所有集成单元进行预测,没有输入集成单元时使用内部缓存
     
     Parameters
     ----------
     X: 所有特征列,DataFrame类型
     units: 集成单元,list(DecitionTree)类型,默认调用内部缓存
     mode: 模式,分类->'c',回归->'r',默认'c'
     classes: 分类标签列表,list(str)类型
     units_result: 是否返回每个单元的分类结果,bool类型,默认False
     return_proba: 是否返回分类概率,分类模式下有效,bool类型,默认值False,
                   分类概率不能直接用于评估
     return_paths: 是否返回决策路径,bool类型,默认值False
                  (路径信息以str类型返回,可转换为list使用)
     show_time: 是否显示耗时,bool类型,默认值False
     ----------
     
     Returns
     -------
     0: 预测的分类/分类概率,Series/DataFrame类型
     1: 各个单元的预测的分类/分类概率,list(Series)/list(DataFrame)类型
     2: 所有数据最终抵达的节点和决策路径,list(DataFrame)类型
     -------
     '''
     start = time.clock()        
     #校验参数
     if type(units)==type(None):
         units=self.units
         
     if type(mode)==type(None):
         mode=self.mode
     check_type('mode',type(mode),type(''))
     mode_list=['c','r']
     check_limit('mode',mode in mode_list,str(mode_list))
     
     if (type(classes)==type(None))&(mode=='c'):
         classes=self.classes
         
     check_type('units',type(units),type([]))
     if len(units)==0:
         raise ValueError('lack of units')
     if mode=='r':
         check_type('element in units',type(units[0]),type(dt.DecisionTree()))
     elif mode=='c':
         check_type('element in units',type(units[0][0]),type(dt.DecisionTree()))
     
     check_type('return_proba',type(return_proba),type(True))
     check_type('return_paths',type(return_paths),type(True))
     check_type('show_time',type(show_time),type(True))
     
     X,continuity_X=self.unit_test.check_input_X_(X)
     features=[]
     if mode=='c':
         for units_ in units:
             for unit in units_:
                 features+=unit.tree.features
     elif mode=='r':
         for unit in units:
             features+=unit.tree.features
     features=list(set(features))
     check_items_match(X.columns,features,'X','unit','features',mode='right')
     #分类模式先求分类概率,回归模式直接求回归值
     n=len(X)
     if mode=='c':
         #定义存放分类结果的DataFrame
         p_y=pd.DataFrame(
                 np.zeros((n,len(classes))),
                 index=X.index,columns=classes)
     elif mode=='r':
         #定义存放回归值的Series
         p_y=pd.Series(np.zeros(n),index=X.index)
     #逐个调用每个单元进行预测,并将结果累加
     units_p_y,units_paths=[],[]
     for i in range(len(units)):
         if show_time==True:
             print('\npredicting with unit %d ---'%i)
         if mode=='r':
             if return_paths==True:
                 p_y_,paths=units[i].predict(X,return_proba=True,return_paths=True,
                                            show_time=show_time,check_input=False)
                 units_paths.append(paths)
             else:
                 p_y_=units[i].predict(X,return_proba=True,return_paths=False,
                                      show_time=show_time,check_input=False)
             p_y+=p_y_
             if units_result==True:
                 if (mode=='c')&(return_proba==False):
                     p_y_=units[i].choose_class_(p_y_,classes)
                 units_p_y.append(p_y_)
         #分类模式需要调用子单元对每个类的概率进行预测
         elif mode=='c':
             classes_p_y,classes_paths=[],[]
             for j in range(len(classes)):
                 if return_paths==True:
                     p_y_,paths=units[i][j].predict(X,return_proba=True,return_paths=True,
                                                    show_time=show_time,check_input=False)
                     classes_paths.append(paths)
                 else:
                     p_y_=units[i][j].predict(X,return_proba=True,return_paths=False,
                                              show_time=show_time,check_input=False)
                 p_y.iloc[:,j]+=p_y_
                 if units_result==True:
                     if (mode=='c')&(return_proba==False):
                         p_y_=units[i].choose_class_(p_y_,classes)
                     classes_p_y.append(p_y_)
             if return_paths==True:
                 units_paths.append(classes_paths)
             if units_result==True:
                 units_p_y.append(classes_p_y)
     #返回分类概率或唯一分类
     if (mode=='c')&(return_proba==False):
         p_y=self.unit_test.choose_class_(p_y,classes)
     end = time.clock()
     if show_time==True:
         print('\ntotal time used for predict: %f'%(end-start))
     if units_result==True:
         if return_paths==True:
             return p_y,units_p_y,paths
         else:
             return p_y,units_p_y
     else:
         if return_paths==True:
             return p_y,paths
         else:
             return p_y
示例#6
0
 def predict(self, X, return_u=False, show_time=False, check_input=True):
     '''\n
     Function: 对输入数据进行预测
     
     Parameters
     ----------
     X: 特征矩阵,DataFrame类型
     return_u: 是否返回函数间隔,bool类型,默认False
     show_time: 是否显示时间开销,bool类型,默认False
     check_input: 是否进行输入校验,bool类型,默认值True
     ----------
     
     Returns
     -------
     0: 预测值向量,Series类型
     -------
     '''
     start = time.clock()
     #输入校验
     check_type('check_input', type(check_input), type(True))
     if check_input == True:
         X = self.check_input_X_(X)
         check_items_match(X.columns,
                           self.sv_X[0][0, :],
                           'features in X',
                           'support vector',
                           'numbers',
                           mode='len')
     #计算函数间隔
     if self.mode == 'c':
         classes_n = len(self.classes)
         classifiers_n = len(self.a)
         #二分类
         if classes_n == 2:
             u = self.decision_(X.values, self.a[0], self.sv_y[0],
                                self.sv_X[0], self.b[0], self.k_type,
                                self.k_args)
         #多分类
         else:
             u = np.zeros((len(X), classes_n))
             #ovr
             if classifiers_n == classes_n:
                 for i in range(classes_n):
                     u_ = self.decision_(X.values, self.a[i], self.sv_y[i],
                                         self.sv_X[i], self.b[i],
                                         self.k_type, self.k_args)
                     u[:, i] += u_
             #tree
             elif classifiers_n < classes_n:
                 tree = self.tree
                 flow = np.zeros(len(X)).astype('int')
                 for i in range(len(self.a)):
                     ft = (flow == i)
                     X_ = X[ft].values
                     u_ = self.decision_(X_, self.a[i], self.sv_y[i],
                                         self.sv_X[i], self.b[i],
                                         self.k_type, self.k_args)
                     left_child = tree.loc[(tree['svm'] == i) &
                                           (tree['y'] == 1),
                                           'next'].values[0]
                     right_child = tree.loc[(tree['svm'] == i) &
                                            (tree['y'] == -1),
                                            'next'].values[0]
                     left_classes = tree.loc[(tree['svm'] == i) &
                                             (tree['y'] == 1),
                                             'classes'].values[0]
                     right_classes = tree.loc[(tree['svm'] == i) &
                                              (tree['y'] == -1),
                                              'classes'].values[0]
                     flow_ = flow[ft]
                     if left_child != -1:
                         flow_[u_ >= 0] = left_child
                     else:
                         class_idx = self.classes.index(left_classes[0])
                         u__ = np.zeros(len(X_))
                         u__[u_ >= 0] = 1.
                         u[ft, class_idx] += u__
                     if right_child != -1:
                         flow_[u_ < 0] = right_child
                     else:
                         class_idx = self.classes.index(right_classes[0])
                         u__ = np.zeros(len(X_))
                         u__[u_ < 0] = 1.
                         u[ft, class_idx] += u__
                     flow[flow == i] = flow_
                 self.flow = flow
             else:
                 raise ValueError('too many classifiers')
         if return_u == False:
             #划分类别
             if classes_n == 2:
                 p_y = self.devide_(u)
                 p_y[p_y == 1] = self.classes[1]
                 p_y[p_y == -1] = self.classes[0]
             else:
                 u_max = u.max(axis=1)
                 max_idx = (u.T == u_max).T.astype('int')
                 classes_idx = np.array(range(u.shape[1]))
                 p_y_ = np.dot(max_idx, classes_idx).astype('int')
                 p_y = np.full(len(p_y_), '')
                 for i in range(classes_n):
                     p_y[p_y_ == i] = self.classes[i]
             p_y = pd.Series(p_y, index=X.index)
             time_cost = time.clock() - start
             if show_time == True:
                 print('\ntime used for predict: %f' % time_cost)
             return p_y
         else:
             if classes_n == 2:
                 u = pd.DataFrame(np.c_[u, -u],
                                  columns=self.classes,
                                  index=X.index)
             else:
                 u = pd.DataFrame(u, columns=self.classes, index=X.index)
             time_cost = time.clock() - start
             if show_time == True:
                 print('\ntime used for predict: %f' % time_cost)
             return u
     elif self.mode == 'r':
         u = self.decision_(X.values, self.a[0], self.sv_y[0], self.sv_X[0],
                            self.b[0], self.k_type, self.k_args)
         u = pd.Series(u, index=X.index)
         time_cost = time.clock() - start
         if show_time == True:
             print('\ntime used for predict: %f' % time_cost)
         return u
     else:
         raise ValueError('unsupported mode')