Пример #1
0
def _binaryEntropy(p):
    """Return the base-2 entropy of a two-class split with positive rate *p*.

    A pure split (``p`` equal to 0 or 1) has entropy 0; handling it here
    avoids calling ``math.log`` on 0, which raises ``ValueError``.
    """
    if p <= 0 or p >= 1:
        return 0.0
    return -(p * math.log(p, 2) + (1 - p) * math.log(1 - p, 2))


def calEntropy(confidition, feature):
    """Compute the information gain ratio of every candidate feature.

    Args:
        confidition: SQL filter fragment selecting the current subset. A
            falsy value means "no filter"; otherwise further predicates are
            appended with ``" and "``.
        feature: iterable of column names to evaluate. The columns "编号"
            and "好瓜" are skipped.

    Returns:
        dict mapping each evaluated column name to its gain ratio
        ``(H(D) - H(D|A)) / SplitInfo(A)``. A feature whose split
        information is 0 (it takes a single value in the subset) gets a
        ratio of 0.0. An empty dict is returned when the subset is empty.

    Note:
        Assumes ``select_table`` returns a list of 1-tuples holding the
        class column, with ``('是',)`` marking a positive sample -- this is
        how the rows are counted below; confirm against ``select_table``.
    """
    data = select_table(confidition)
    cases = len(data)
    if cases == 0:
        # Nothing to split; avoids dividing by zero below.
        return {}

    # Entropy of the whole subset, H(D).
    hd = _binaryEntropy(data.count(('是', )) / float(cases))

    # Every per-label predicate is appended to the same prefix.
    if not confidition:
        prefix = " where "
    else:
        prefix = confidition + " and "

    gain_ratio = {}
    for idx in feature:
        if idx == "编号" or idx == "好瓜":
            continue
        labels = [
            row[0]
            for row in select_table(confidition, label=idx, distict=True)
            if len(row) == 1
        ]

        cond_entropy = 0.0  # H(D|A): weighted entropy after the split
        split_info = 0.0    # SplitInfo(A): entropy of the partition sizes
        for label in labels:
            # Standard SQL "=" is used, matching the other tree helpers.
            now_confidition = prefix + " " + idx + " = '" + label + "'"
            feature_data = select_table(condition=now_confidition)
            if not feature_data:
                continue
            k_ = float(len(feature_data)) / cases  # share of the subset
            # Positive-sample rate inside this branch.
            m_ = feature_data.count(('是', )) / float(len(feature_data))
            cond_entropy += k_ * _binaryEntropy(m_)
            split_info -= k_ * math.log(k_, 2)

        if split_info > 0:
            gain_ratio[idx] = (hd - cond_entropy) / split_info
        else:
            # Single-valued (or empty) feature: no information, and the
            # ratio would otherwise be a division by zero.
            gain_ratio[idx] = 0.0
    return gain_ratio
Пример #2
0
def _giniImpurity(rows):
    """Return the two-class Gini impurity ``2p(1-p)`` of *rows*.

    ``p`` is the fraction of rows equal to ``('是',)``. An empty row list
    has impurity 0.0, which avoids a division by zero.
    """
    if not rows:
        return 0.0
    p = float(rows.count(('是', ))) / len(rows)
    return 2 * p * (1 - p)


def calGini(confidition, feature):
    """Find, for every candidate feature, its best binary split by Gini index.

    Each distinct value of a feature is tried as a "value vs. everything
    else" split, and the value with the lowest weighted Gini index wins.

    Args:
        confidition: SQL filter fragment selecting the current subset. A
            falsy value means "no filter"; otherwise further predicates are
            appended with ``" and "``.
        feature: iterable of column names to evaluate. The columns "编号"
            and "好瓜" are skipped.

    Returns:
        A pair ``(gini_num, gini_name)`` of dicts keyed by column name:
        the minimum Gini index, and the feature value that achieves it.
        A feature with no usable value keeps the sentinels ``999`` and ``""``.

    Note:
        Assumes ``select_table`` returns a list of 1-tuples holding the
        class column, with ``('是',)`` marking a positive sample -- confirm
        against ``select_table``.
    """
    gini_name = {}
    gini_num = {}

    cases = len(select_table(confidition))

    # Every per-label predicate is appended to the same prefix.
    if not confidition:
        prefix = " where "
    else:
        prefix = confidition + " and "

    for idx in feature:
        if idx == "编号" or idx == "好瓜":
            continue
        labels = [
            row[0]
            for row in select_table(confidition, label=idx, distict=True)
            if len(row) == 1
        ]
        minGini = 999
        minName = ""
        for label in labels:
            now_confidition = prefix + " " + idx + " = '" + label + "'"
            not_confidition = prefix + " " + idx + " != '" + label + "'"
            feature_data = select_table(condition=now_confidition)
            other_feature_data = select_table(condition=not_confidition)
            if not feature_data:
                continue
            k_ = float(len(feature_data)) / cases  # share of the subset
            # The complement is empty when the feature has one distinct
            # value here; _giniImpurity then contributes 0 instead of
            # dividing by zero.
            temp = (k_ * _giniImpurity(feature_data) +
                    (1 - k_) * _giniImpurity(other_feature_data))
            if temp < minGini:
                minGini = temp
                minName = label
        gini_name[idx] = minName
        gini_num[idx] = minGini
    return gini_num, gini_name
Пример #3
0
def checkSameLable(confidition):
    """Report whether every sample selected by *confidition* shares one class.

    Args:
        confidition: SQL filter fragment passed to ``select_table``.

    Returns:
        ``(True, '好瓜')`` when every row equals ``('是',)`` (this is also
        the result for an empty selection), ``(True, '坏瓜')`` when no row
        does, and ``(False, None)`` for a mixed selection.
    """
    rows = select_table(condition=confidition)
    good_count = float(rows.count(('是', )))
    # All-positive is checked first, so an empty selection lands here.
    if good_count == len(rows):
        return True, '好瓜'
    if good_count == 0:
        return True, '坏瓜'
    return False, None
Пример #4
0
def _addLeafNode(parent, edge_label, verdict):
    """Add a uniquely named leaf for *verdict* under *parent* in graph ``G``."""
    global cnt
    # The running counter keeps leaves with the same verdict distinct.
    leaf = "LeafNode:" + str(cnt) + " " + verdict
    G.add_node(leaf, fontname="SimHei")
    G.add_edge(parent,
               leaf,
               label=edge_label,
               fontname="SimHei",
               color="black",
               style="dashed",
               penwidth=1.5)
    cnt = cnt + 1


def buildId3Tree(confidition, root, feature):
    """Recursively grow the decision tree below *root* into the graph ``G``.

    For every distinct value of the *root* column, the matching subset is
    either turned into a leaf (all samples share one class, or no feature
    is left to split on) or split again on the remaining feature with the
    highest gain ratio reported by ``calEntropy``.

    Args:
        confidition: SQL filter fragment selecting the current subset. A
            falsy value means "no filter"; otherwise further predicates are
            appended with ``" and "``.
        root: column name of the node being expanded. It is removed from
            *feature* in place.
        feature: list of column names still available for splitting; must
            contain *root*.

    Raises:
        ValueError: if *root* is not in *feature* (from ``list.remove``).

    Side effects:
        Adds nodes and edges to the module-level graph ``G`` and advances
        the module-level leaf counter ``cnt``.
    """
    # Remove root from the feature set.
    feature.remove(root)

    # Distinct values of the root column within the current subset.
    labels = [
        row[0]
        for row in select_table(confidition, label=root, distict=True)
        if len(row) == 1
    ]

    if not confidition:
        prefix = " where "
    else:
        prefix = confidition + " and "

    for label in labels:
        now_confidition = prefix + " " + root + " = '" + label + "'"

        # A pure subset becomes a leaf.
        flag, nflag = checkSameLable(now_confidition)
        if flag:
            _addLeafNode(root, label, nflag)
            continue

        # Gain ratio of every remaining feature on this branch.
        h_feature = calEntropy(now_confidition, feature)
        if not h_feature:
            # No feature left to split on: fall back to a majority-class
            # leaf instead of calling max() on an empty dict. Ties are
            # resolved in favour of '好瓜'.
            rows = select_table(condition=now_confidition)
            if rows.count(('是', )) * 2 >= len(rows):
                _addLeafNode(root, label, '好瓜')
            else:
                _addLeafNode(root, label, '坏瓜')
            continue

        # The feature with the largest gain ratio becomes the child node.
        maxidx = max(h_feature, key=h_feature.get)
        print("maxidx=", maxidx)
        # NOTE(review): the child node is keyed by feature name only, so the
        # same feature chosen on two branches maps to one graph node --
        # confirm this is the intended drawing.
        G.add_node((maxidx), label=label)
        G.add_edge(root, maxidx, label=label)
        # Recurse on the chosen feature (not on root again), and hand the
        # subtree its own copy so sibling branches keep their features.
        buildId3Tree(now_confidition, maxidx, list(feature))