def predict(self, X):
    """Predict a class label for every row of X.

    The class probabilities returned by ``self.b_prob_`` are reduced to the
    most probable class per row; ties are broken at random.

    Parameters
    ----------
    X : feature data; it is passed through ``self.check_input_X_`` and its
        columns must cover the features listed in ``self.X_d_p['feature']``.

    Returns
    -------
    pandas.Series named 'classify', indexed like X, holding the predicted
    class labels taken from ``self.y_d_p['value']``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    X = self.check_input_X_(X)
    features = self.X_d_p['feature'].drop_duplicates().tolist()
    check_items_match(X.columns, features, 'X', 'P(X)', 'features', mode='right')

    p_y_X = self.b_prob_(X, self.c_p, self.X_d_p, self.y_d_p)

    # Boolean mask marking, for each row, the column(s) holding the row maximum.
    p_y_X_max = (p_y_X.T == p_y_X.max(axis=1)).T

    # Rows with more than one maximum are ties: keep one winner at random.
    need_repair = np.where(p_y_X_max.sum(axis=1) > 1)[0]
    for i in need_repair:
        tied = np.where(p_y_X_max[i, :] == 1)[0]
        # uniform() may return its upper bound because of float rounding,
        # so clamp the drawn position to the last valid index.
        pick = min(int(random.uniform(0, len(tied))), len(tied) - 1)
        keep = tied[pick]
        p_y_X_max[i, :] = 0
        p_y_X_max[i, keep] = 1

    classes = self.y_d_p['value'].values
    classes_idx = np.array(range(len(classes)))
    # Exactly one flag is set per row, so the dot product is the class index.
    pred_idx = np.dot(p_y_X_max, classes_idx).astype('int')

    # Look the labels up directly. Rewriting indices in place
    # (pred[pred == i] = classes[i]) remaps a row twice whenever a label is
    # itself an integer inside the index range.
    pred_y = pd.Series(classes[pred_idx], name='classify', index=X.index)

    print('\ntime used for predict: %f' % (time.perf_counter() - start))
    return pred_y
def selection(self, test_X, test_y, units=None, units_oob_score=None,
              use='oob', return_units=False, show_time=False):
    """Select a subset of the ensemble units using a test set.

    The role is similar to pruning a decision tree: candidate subsets are
    produced by the chosen rule and the selection is delegated to
    ``self.random_selection_`` or ``self.oob_selection_``.

    Parameters
    ----------
    test_X : test-set features (DataFrame).
    test_y : test-set target (Series).
    units : list of DecisionTree, defaults to ``self.units``.
    units_oob_score : list of float OOB scores, defaults to
        ``self.units_oob_score``.
    use : str, selection method; 'rd' -> random selection,
        'oob' -> OOB-based selection. Default 'oob'.
    return_units : bool, return the selected units instead of storing them
        on the instance. Default False.
    show_time : bool, print the elapsed time. Default False.

    Returns
    -------
    The selected units when ``return_units`` is True; otherwise None, and
    ``self.units`` is replaced by the selected subset.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Fall back to the units/scores cached on the instance.
    if units is None:
        units = self.units
    if units_oob_score is None:
        units_oob_score = self.units_oob_score

    # Input validation.
    check_type('units', type(units), type([]))
    check_type('element in units', type(units[0]), type(dt.DecisionTree()))
    check_type('units_oob_score', type(units_oob_score), type([]))
    check_type('element in units_oob_score', type(units_oob_score[0]),
               [type(0.0), np.float64])
    check_type('use', type(use), type(''))
    check_type('return_units', type(return_units), type(True))
    use_list = ['rd', 'oob']
    check_limit('use', use in use_list, str(use_list))

    test_X, _ = self.unit_test.check_input_X_(test_X, 'test_X')
    test_y, _ = self.unit_test.check_input_y_(test_y, 'test_y')
    check_index_match(test_X, test_y, 'test_X', 'test_y')

    # The test set must provide every feature used by any unit.
    features = list({f for unit in units for f in unit.tree.features})
    check_items_match(test_X.columns, features, 'test_X', 'tree', 'features',
                      mode='right')

    # Selection ('use' was validated above, so one branch always runs).
    if use == 'rd':
        subset = self.random_selection_(test_X, test_y, units)
    elif use == 'oob':
        subset = self.oob_selection_(test_X, test_y, units, units_oob_score)

    end = time.perf_counter()
    if show_time == True:
        print('\ntime used for selection:%f' % (end - start))

    if return_units:
        return subset
    self.units = subset
def predict(self, X, theta=None, show_time=False, check_input=True):
    """Predict target values for the rows of X.

    Parameters
    ----------
    X : feature matrix (DataFrame).
    theta : parameter vector (Series); ``self.theta`` is used when omitted.
    show_time : bool, print the elapsed time. Default False.
    check_input : bool, validate X and theta before predicting. Default True.

    Returns
    -------
    pandas.Series of predictions, indexed like X.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Use the externally supplied theta or fall back to the stored one.
    if theta is None:
        theta = self.theta

    # Input validation.
    check_type('check_input', type(check_input), type(True))
    if check_input:
        X = self.check_input_X_(X)
        self.check_input_t_(theta)
        check_items_match(X.columns, theta, 'features in X', 'theta',
                          'numbers', mode='len')

    # Prediction.
    p_y = pd.Series(self.linear_(X, theta), index=X.index)

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for predict: %f' % time_cost)
    return p_y
def predict(self, X, theta=None, classes=None, classes_paired=None,
            return_proba=False, show_time=False, check_input=True):
    """Predict class labels (or class probabilities) for the rows of X.

    ``theta``, ``classes`` and ``classes_paired`` fall back to the values
    stored on the instance when they are not supplied.

    Parameters
    ----------
    X : feature matrix (DataFrame).
    theta : parameter vector/matrix; defaults to ``self.theta``.
    classes : class labels; defaults to ``self.classes``.
    classes_paired : positive/negative class pairing; defaults to
        ``self.classes_paired``.
    return_proba : bool, return class probabilities instead of labels.
        Default False.
    show_time : bool, print the elapsed time. Default False.
    check_input : bool, validate X and theta before predicting. Default True.

    Returns
    -------
    pandas.Series named 'classify' with the predicted labels when
    ``return_proba`` is False; otherwise a DataFrame with one column per
    class. Both are indexed like X.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Use externally supplied parameters or fall back to the stored ones.
    if theta is None:
        theta = self.theta
    if classes is None:
        classes = self.classes
    if classes_paired is None:
        classes_paired = self.classes_paired

    # Input validation.
    check_type('check_input', type(check_input), type(True))
    if check_input:
        X = self.check_input_X_(X)
        self.check_input_t_(theta)
        check_items_match(X.columns, theta, 'features in X', 'theta',
                          'numbers', mode='len')

    # Prediction.
    p_y = self.predict_(X, theta, classes_paired, return_proba)
    if return_proba == False:
        class_idx = pd.Series(p_y, name='classify', index=X.index)
        # Map indices to labels against the untouched index Series. Rewriting
        # a single Series in place remaps rows twice when a label is itself
        # an integer within the index range. The object dtype lets labels of
        # any type be stored without an implicit upcast.
        result = class_idx.astype(object)
        for i, label in enumerate(classes):
            result[class_idx == i] = label
    else:
        result = pd.DataFrame(p_y, columns=classes, index=X.index)

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for predict: %f' % time_cost)
    return result
def predict(self, X, units=None, mode=None, classes=None, units_result=False,
            return_proba=False, return_paths=False, show_time=False):
    """Predict with every ensemble unit and accumulate the results.

    Parameters
    ----------
    X : all feature columns (DataFrame).
    units : ensemble units, defaults to ``self.units``. In regression mode
        a list of DecisionTree; in classification mode a list whose
        elements hold one DecisionTree per class.
    mode : str, 'c' for classification or 'r' for regression; defaults to
        ``self.mode``.
    classes : class labels, defaults to ``self.classes`` in classification
        mode.
    units_result : bool, also return the per-unit predictions.
        Default False.
    return_proba : bool, classification only: return the accumulated class
        scores instead of a single class. Default False.
    return_paths : bool, also return the decision paths reported by the
        units. Default False.
    show_time : bool, print timing information. Default False.

    Returns
    -------
    p_y : accumulated prediction (Series in regression mode, DataFrame of
        class scores or the chosen classes in classification mode).
    units_p_y : per-unit predictions, only when ``units_result`` is True.
    units_paths : per-unit decision paths (one entry per unit; in
        classification mode each entry is a list with one item per class),
        only when ``return_paths`` is True.

    Raises
    ------
    ValueError : if ``units`` is empty.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Parameter validation, falling back to cached values where omitted.
    if units is None:
        units = self.units
    if mode is None:
        mode = self.mode
    check_type('mode', type(mode), type(''))
    mode_list = ['c', 'r']
    check_limit('mode', mode in mode_list, str(mode_list))
    if classes is None and mode == 'c':
        classes = self.classes
    check_type('units', type(units), type([]))
    if len(units) == 0:
        raise ValueError('lack of units')
    if mode == 'r':
        check_type('element in units', type(units[0]), type(dt.DecisionTree()))
    elif mode == 'c':
        check_type('element in units', type(units[0][0]), type(dt.DecisionTree()))
    check_type('return_proba', type(return_proba), type(True))
    check_type('return_paths', type(return_paths), type(True))
    check_type('show_time', type(show_time), type(True))
    X, _ = self.unit_test.check_input_X_(X)

    # X must provide every feature used by any unit.
    if mode == 'c':
        features = [f for units_ in units for unit in units_
                    for f in unit.tree.features]
    else:
        features = [f for unit in units for f in unit.tree.features]
    features = list(set(features))
    check_items_match(X.columns, features, 'X', 'unit', 'features', mode='right')

    # Classification accumulates per-class scores, regression a single value.
    n = len(X)
    if mode == 'c':
        p_y = pd.DataFrame(np.zeros((n, len(classes))),
                           index=X.index, columns=classes)
    else:
        p_y = pd.Series(np.zeros(n), index=X.index)

    # Call each unit in turn and add its output to the running total.
    units_p_y, units_paths = [], []
    for i in range(len(units)):
        if show_time:
            print('\npredicting with unit %d ---' % i)
        if mode == 'r':
            if return_paths:
                p_y_, paths = units[i].predict(X, return_proba=True, return_paths=True,
                                               show_time=show_time, check_input=False)
                units_paths.append(paths)
            else:
                p_y_ = units[i].predict(X, return_proba=True, return_paths=False,
                                        show_time=show_time, check_input=False)
            p_y += p_y_
            if units_result == True:
                units_p_y.append(p_y_)
        # Classification mode calls one sub-unit per class.
        elif mode == 'c':
            classes_p_y, classes_paths = [], []
            for j in range(len(classes)):
                if return_paths:
                    p_y_, paths = units[i][j].predict(X, return_proba=True, return_paths=True,
                                                      show_time=show_time, check_input=False)
                    classes_paths.append(paths)
                else:
                    p_y_ = units[i][j].predict(X, return_proba=True, return_paths=False,
                                               show_time=show_time, check_input=False)
                p_y.iloc[:, j] += p_y_
                if units_result == True:
                    if not return_proba:
                        # NOTE(review): units[i] is indexed per class above, so it
                        # is presumably a list and has no choose_class_ method;
                        # this path looks broken in the original - confirm the
                        # intended per-unit output before relying on it.
                        p_y_ = units[i].choose_class_(p_y_, classes)
                    classes_p_y.append(p_y_)
            if return_paths:
                units_paths.append(classes_paths)
            if units_result == True:
                units_p_y.append(classes_p_y)

    # Return class scores or reduce them to a single class.
    if mode == 'c' and not return_proba:
        p_y = self.unit_test.choose_class_(p_y, classes)

    end = time.perf_counter()
    if show_time:
        print('\ntotal time used for predict: %f' % (end - start))

    # Return the accumulated paths of all units; the loop variable `paths`
    # only holds the result of the last unit/class that was evaluated.
    if units_result == True:
        if return_paths:
            return p_y, units_p_y, units_paths
        return p_y, units_p_y
    if return_paths:
        return p_y, units_paths
    return p_y
def predict(self, X, return_u=False, show_time=False, check_input=True):
    """Predict with the fitted support-vector model(s).

    Parameters
    ----------
    X : feature matrix (DataFrame).
    return_u : bool, classification only: return the decision values
        instead of class labels. Default False.
    show_time : bool, print the elapsed time. Default False.
    check_input : bool, validate X before predicting. Default True.

    Returns
    -------
    Classification (``self.mode == 'c'``): a Series of class labels, or a
    DataFrame with one column per class when ``return_u`` is True.
    Regression (``self.mode == 'r'``): a Series of decision values.
    All results are indexed like X.

    Raises
    ------
    ValueError : if there are more classifiers than classes in the
        multi-class case, or if ``self.mode`` is neither 'c' nor 'r'.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    # Input validation.
    check_type('check_input', type(check_input), type(True))
    if check_input:
        X = self.check_input_X_(X)
        check_items_match(X.columns, self.sv_X[0][0, :], 'features in X',
                          'support vector', 'numbers', mode='len')

    if self.mode == 'c':
        classes_n = len(self.classes)
        classifiers_n = len(self.a)

        # Compute the decision values.
        if classes_n == 2:
            # Binary classification: a single classifier.
            u = self.decision_(X.values, self.a[0], self.sv_y[0], self.sv_X[0],
                               self.b[0], self.k_type, self.k_args)
        else:
            # Multi-class: one column of scores per class.
            u = np.zeros((len(X), classes_n))
            if classifiers_n == classes_n:
                # One classifier per class ("ovr" in the original comments).
                for i in range(classes_n):
                    u[:, i] += self.decision_(X.values, self.a[i], self.sv_y[i],
                                              self.sv_X[i], self.b[i],
                                              self.k_type, self.k_args)
            elif classifiers_n < classes_n:
                # Classifiers arranged as a tree: `flow` records which
                # classifier each sample is currently routed to.
                tree = self.tree
                flow = np.zeros(len(X)).astype('int')
                for i in range(len(self.a)):
                    ft = (flow == i)
                    X_ = X[ft].values
                    u_ = self.decision_(X_, self.a[i], self.sv_y[i], self.sv_X[i],
                                        self.b[i], self.k_type, self.k_args)
                    pos_rows = tree.loc[(tree['svm'] == i) & (tree['y'] == 1)]
                    neg_rows = tree.loc[(tree['svm'] == i) & (tree['y'] == -1)]
                    left_child = pos_rows['next'].values[0]
                    right_child = neg_rows['next'].values[0]
                    left_classes = pos_rows['classes'].values[0]
                    right_classes = neg_rows['classes'].values[0]
                    flow_ = flow[ft]
                    # A child of -1 marks a leaf: the sample votes for that
                    # side's class instead of moving to another classifier.
                    if left_child != -1:
                        flow_[u_ >= 0] = left_child
                    else:
                        class_idx = self.classes.index(left_classes[0])
                        vote = np.zeros(len(X_))
                        vote[u_ >= 0] = 1.
                        u[ft, class_idx] += vote
                    if right_child != -1:
                        flow_[u_ < 0] = right_child
                    else:
                        class_idx = self.classes.index(right_classes[0])
                        vote = np.zeros(len(X_))
                        vote[u_ < 0] = 1.
                        u[ft, class_idx] += vote
                    flow[flow == i] = flow_
                self.flow = flow
            else:
                raise ValueError('too many classifiers')

        if return_u == False:
            # Turn decision values into class labels.
            if classes_n == 2:
                p_y = self.devide_(u)
                # Take both masks before writing any label so that a numeric
                # label equal to 1 or -1 is not remapped a second time.
                is_pos = (p_y == 1)
                is_neg = (p_y == -1)
                p_y[is_pos] = self.classes[1]
                p_y[is_neg] = self.classes[0]
            else:
                # argmax yields exactly one class per row, even on ties. An
                # object array keeps the full label: np.full(n, '') creates a
                # one-character string array that truncates longer labels.
                best = u.argmax(axis=1)
                p_y = np.asarray(self.classes, dtype=object)[best]
            result = pd.Series(p_y, index=X.index)
        else:
            if classes_n == 2:
                # NOTE(review): u is labelled self.classes[0] here, while the
                # label branch above maps a value of 1 to self.classes[1] -
                # confirm the intended column order.
                result = pd.DataFrame(np.c_[u, -u], columns=self.classes,
                                      index=X.index)
            else:
                result = pd.DataFrame(u, columns=self.classes, index=X.index)
    elif self.mode == 'r':
        u = self.decision_(X.values, self.a[0], self.sv_y[0], self.sv_X[0],
                           self.b[0], self.k_type, self.k_args)
        result = pd.Series(u, index=X.index)
    else:
        raise ValueError('unsupported mode')

    time_cost = time.perf_counter() - start
    if show_time == True:
        print('\ntime used for predict: %f' % time_cost)
    return result