Exemplo n.º 1
0
def classify(inputTree, features, testVec):
    """Classify one sample by walking a nested-dict decision tree.

    Args:
        inputTree: Decision tree serialized as nested dicts of the form
            ``{feature_name: {branch_key: subtree_or_label, ...}}``.
        features: List of feature names; the position of a name in this
            list is the position of that feature's value in ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The value stored at the leaf that the sample reaches.

    Raises:
        IndexError: If ``inputTree`` is empty.
        ValueError: If the split feature is not present in ``features``.
        KeyError: If the sample's value has no matching branch.
    """
    # list(d)[0] works on both Python 2 and 3; d.keys()[0] raises TypeError
    # on Python 3 because keys() returns a non-subscriptable view.
    firstStr = list(inputTree)[0]  # feature this node splits on
    secondDict = inputTree[firstStr]  # branches hanging below this node

    featIndex = features.index(firstStr)
    key = testVec[featIndex]  # the sample's value for the split feature

    # -------------- select the branch to follow --------------
    if isinstance(key, str):
        # Discrete feature: the value itself is the branch key.
        valueOfFeat = secondDict[key]
    else:
        # Continuous feature: branches are keyed "<=" + t and ">" + t.
        # NOTE(review): getNumofCommonSubstr is defined elsewhere; it is
        # presumably returning the substring shared by the two branch keys
        # (the threshold text) as its first element -- confirm.
        branch_keys = list(secondDict)
        common_str = getNumofCommonSubstr(branch_keys[0], branch_keys[1])[0]
        if key <= float(common_str):
            valueOfFeat = secondDict["<=" + common_str]
        else:
            valueOfFeat = secondDict[">" + common_str]

    # -------------- descend or stop --------------
    if isinstance(valueOfFeat, dict):
        # A dict is a subtree: keep descending.
        return classify(valueOfFeat, features, testVec)
    # Anything else is a leaf, which ends the recursion.
    return valueOfFeat
def chi_square_continuous(bestfeature,branch_names,sub_datas,feature_list):
    """Return the chi-square statistic between a binarized feature and the class.

    The values of ``bestfeature`` are replaced by one of two level labels
    ("<=" + threshold or ">" + threshold) so the feature can be
    cross-tabulated against the class column.

    Args:
        bestfeature: Name of the continuous feature being tested.
        branch_names: Sequence whose first two items are the branch labels
            of the split. NOTE(review): getNumofCommonSubstr (defined
            elsewhere) is presumably extracting the threshold text shared
            by both labels -- confirm.
        sub_datas: Rows of data; not modified (a deep copy is used).
            Assumes each row holds the feature values in ``feature_list``
            order followed by the class label -- TODO confirm with caller.
        feature_list: Feature names for the columns of ``sub_datas``
            excluding the class column. Not modified.

    Returns:
        The chi-square statistic reported by ``chi2_contingency``.
    """
    data = copy.deepcopy(sub_datas)
    feature_index = feature_list.index(bestfeature)
    # Build the column list locally: appending 'class' to the caller's list
    # would grow it on every call and break the next DataFrame construction.
    columns = list(feature_list) + ['class']

    # Turn the continuous feature into two levels around the split threshold.
    threshold = float(getNumofCommonSubstr(branch_names[0], branch_names[1])[0])
    low_label = "<=" + str(threshold)
    high_label = ">" + str(threshold)
    for row in data:
        try:
            if float(row[feature_index]) <= threshold:
                row[feature_index] = low_label
            else:
                row[feature_index] = high_label
        except (TypeError, ValueError, IndexError):
            # Best effort: leave values that cannot be compared or replaced
            # (non-numeric, missing, immutable row) exactly as they are.
            continue

    # Contingency table of feature level versus class label.
    datas = pd.DataFrame(data, columns=columns)
    contingency = pd.crosstab(datas[bestfeature], datas["class"])

    # Chi-square test of independence; only the statistic is returned.
    chi2, _p_value, _degree_freedom, _expected = chi2_contingency(contingency)
    return chi2