def classify(inputTree, features, testVec):
    """Classify one sample by walking a dict-encoded decision tree.

    Args:
        inputTree: Nested dict of the form
            ``{split_feature: {branch_label: subtree_or_leaf}}``.
        features: List of feature names; the position of a name is the
            index of that feature's value inside ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The leaf value reached by following the sample's feature values.

    Raises:
        IndexError: If ``inputTree`` is empty.
        ValueError: If the split feature is not listed in ``features``.
        KeyError: If the sample's value has no matching branch.
    """
    # ``list(d)[0]`` instead of ``d.keys()[0]``: dict views are not
    # subscriptable on Python 3, while this form works on both 2 and 3.
    firstStr = list(inputTree)[0]          # feature this node splits on
    secondDict = inputTree[firstStr]       # branches hanging off this node
    featIndex = features.index(firstStr)   # column of that feature
    key = testVec[featIndex]               # the sample's value for it

    # ---- pick the branch to follow ----
    if not isinstance(key, str):
        # Non-string value: treated as a continuous feature. The two branch
        # labels are expected to look like "<=T" and ">T"; the helper's first
        # result is used as the threshold text T (presumably their common
        # substring -- confirm against getNumofCommonSubstr).
        branch_labels = list(secondDict)
        common_str = getNumofCommonSubstr(branch_labels[0], branch_labels[1])[0]
        if key <= float(common_str):
            key = "<=" + common_str
        else:
            key = ">" + common_str
    # String values are discrete: the value itself is the branch label.
    valueOfFeat = secondDict[key]

    # ---- descend or stop ----
    if isinstance(valueOfFeat, dict):
        # Still a subtree: recurse.
        return classify(valueOfFeat, features, testVec)
    # Leaf node: this ends the recursion.
    return valueOfFeat
def chi_square_continuous(bestfeature, branch_names, sub_datas, feature_list):
    """Return the chi-square statistic between a binarised continuous
    feature and the class column.

    The continuous feature is turned into two levels ("<=T" and ">T") so
    that it can be cross-tabulated against the class like a discrete factor.

    Args:
        bestfeature: Name of the continuous feature being tested.
        branch_names: The two branch labels of the split; the threshold T is
            taken from the first result of
            ``getNumofCommonSubstr(branch_names[0], branch_names[1])``.
        sub_datas: Rows of data. Each row is expected to hold one value per
            entry of ``feature_list`` followed by the class label (the rows
            are loaded under the columns ``feature_list + ['class']``).
        feature_list: Feature names, in column order. It is NOT modified.

    Returns:
        The chi-square statistic computed by ``chi2_contingency`` on the
        contingency table of the binarised feature versus ``"class"``.
    """
    # Work on a copy so the caller's rows are left untouched.
    data = copy.deepcopy(sub_datas)
    feature_index = feature_list.index(bestfeature)

    # Build the column names locally instead of appending 'class' to the
    # caller's list: mutating the argument would grow it on every call and
    # make later calls disagree with the row width.
    columns = list(feature_list) + ['class']

    threshold = float(getNumofCommonSubstr(branch_names[0], branch_names[1])[0])
    low_level = "<=" + str(threshold)
    high_level = ">" + str(threshold)

    # Replace each continuous value with its level. The split is binary, so
    # there are exactly two levels.
    for row in data:
        try:
            if float(row[feature_index]) <= threshold:
                row[feature_index] = low_level
            else:
                row[feature_index] = high_level
        except (TypeError, ValueError, IndexError):
            # Best effort: leave values that cannot be read as a number
            # (or rows that are too short / immutable) as they are.
            continue

    # Contingency table of the binarised feature versus the class column.
    datas = pd.DataFrame(data, columns=columns)
    contingency = pd.crosstab(datas[bestfeature], datas["class"])

    # Chi-square test of independence; only the statistic is needed
    # (p-value, degrees of freedom and expected counts are discarded).
    return chi2_contingency(contingency)[0]