Example #1
0
def CCP_cross_validation(TreeSets, alpha_list, X_test, y_test, feature_names,
                         class_names, sklearn_model):
    precision_list = []
    progress_length = len(TreeSets)
    # print"------------------------------检查下这里------------------------------"
    # print"X_test,y_test=",X_test
    # print y_test
    for index, item in enumerate(TreeSets):
        Ti_precision = precision_compute(item, X_test, y_test, feature_names,
                                         class_names)
        print "T%d_precision=%f" % (index, Ti_precision)
        precision_list.append(Ti_precision)
        print "the T" + str(index) + " has been validated, " + str(
            progress_length - index - 1) + " Trees left, wait please....."

    pruned_precision = max(precision_list)
    index = precision_list.index(pruned_precision)
    print "index=", index
    best_alpha = alpha_list[index]
    Best_tree = TreeSets[index]
    dot_file = "./visualization/Best_tree.dot"
    svg_file = "./visualization/Best_tree.svg"
    #画一画树

    best_sklearn_model = copy.deepcopy(sklearn_model)
    prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree)

    draw_file(best_sklearn_model, dot_file, svg_file, feature_names)
    return Best_tree, best_alpha, pruned_precision
def CCP_validation(TreeSets,alpha_list,X_test,y_test,feature_names,class_names,sklearn_model,b_SE):
    """Pick a pruned subtree from TreeSets by precision on (X_test, y_test).

    b_SE=False: 0-SE rule -- take the subtree with the highest precision.
    b_SE=True : 1-SE rule -- take the smallest subtree whose error rate is
    below lowest_error_rate + SE.

    Side effect: prunes a deep copy of sklearn_model to match the winner and
    draws it to ./visualization/Best_tree_0SE.svg or Best_tree_1SE.svg.

    Returns (Best_tree, best_alpha, pruned_precision, precision_list[0])
    where precision_list[0] is the unpruned tree's precision.
    """
    precision_list=[]
    progress_length=len(TreeSets)
    for index,item in enumerate(TreeSets):
        Ti_precision=precision_compute(item,X_test,y_test,feature_names,class_names)
        print"T%d_precision=%f"%(index,Ti_precision)
        precision_list.append(Ti_precision)
        print"the T"+str(index)+" has been validated, "+str(progress_length-index-1)+" Trees left, wait please....."
    if b_SE==False:
        pruned_precision=max(precision_list)
        index=precision_list.index(pruned_precision)
        print"index=",index
        best_alpha=alpha_list[index]
        Best_tree=TreeSets[index]
        dot_file="./visualization/Best_tree_0SE.dot"
        svg_file="./visualization/Best_tree_0SE.svg"
        # Prune a deep copy so the caller's model stays intact, then draw it.

        best_sklearn_model=copy.deepcopy(sklearn_model)
        prune_sklearn_model(best_sklearn_model.tree_,0,Best_tree)

        draw_file(best_sklearn_model,dot_file,svg_file,feature_names)
        return Best_tree,best_alpha,pruned_precision,precision_list[0]

    else:# apply the 1-SE rule
        error_rate_list=[1-item for item in precision_list]
        lowest_error_rate=min(error_rate_list)
        print"error_rate_list=",error_rate_list
        # Standard error of the lowest error rate, treated as a binomial
        # proportion over the len(y_test) test samples.
        SE=sqrt(lowest_error_rate*(1-lowest_error_rate)/len(y_test))
        print"SE=",SE

        criterion_1_SE=lowest_error_rate+SE

        index_error_rate=0
        for index,item in enumerate(error_rate_list):# search from the end, because error_rate_list is not monotonic

            if error_rate_list[len(error_rate_list)-1-index]<criterion_1_SE:
                index_error_rate=len(error_rate_list)-1-index
                break

        # (An extra step back by one index was considered here and removed,
        # because the list may contain only a single item.)

        pruned_precision=precision_list[index_error_rate]# precision_list is index-aligned with error_rate_list

        best_alpha=alpha_list[index_error_rate]
        Best_tree=TreeSets[index_error_rate]
        dot_file="./visualization/Best_tree_1SE.dot"
        svg_file="./visualization/Best_tree_1SE.svg"
        # Prune a deep copy so the caller's model stays intact, then draw it.
        best_sklearn_model=copy.deepcopy(sklearn_model)
        prune_sklearn_model(best_sklearn_model.tree_,0,Best_tree)

        draw_file(best_sklearn_model,dot_file,svg_file,feature_names)
        return Best_tree,best_alpha,pruned_precision,precision_list[0]
def ECP_1SE_validation(TreeSets, alpha_list, X_test, y_test, feature_names,
                       sklearn_model, b_SE):
    """Pick a pruned regression subtree from TreeSets by MSE on the test set.

    b_SE=False: 0-SE rule -- take the subtree with the lowest MSE.
    b_SE=True : 1-SE rule -- take the smallest subtree whose MSE is below
    min_mse + SE.

    Side effect: prunes a deep copy of sklearn_model to match the winner and
    draws it to ./visualization/Best_tree_0SE.svg or Best_tree_1SE.svg.

    Returns (Best_tree, best_alpha, pruned_mse, mse_list[0]) where
    mse_list[0] is the unpruned tree's MSE.
    """
    mse_list = []
    progress_length = len(TreeSets)
    for index, item in enumerate(TreeSets):
        Ti_mse = mse_compute(item, X_test, y_test, feature_names)
        print "T%d_mse=%f" % (index, Ti_mse)
        mse_list.append(Ti_mse)
        print "the T" + str(index) + " has been validated, " + str(
            progress_length - index - 1) + " Trees left, wait please....."
    if b_SE == False:
        pruned_mse = min(mse_list)
        index = mse_list.index(pruned_mse)
        #------------------ shared epilogue (start) -------------------
        best_alpha = alpha_list[index]
        Best_tree = TreeSets[index]
        dot_file = "./visualization/Best_tree_0SE.dot"
        svg_file = "./visualization/Best_tree_0SE.svg"
        # Prune a deep copy so the caller's model stays intact, then draw it.
        print "unpruned_mse=", mse_list[0]
        best_sklearn_model = copy.deepcopy(sklearn_model)
        prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree)

        draw_file(best_sklearn_model, dot_file, svg_file, feature_names)
        return Best_tree, best_alpha, pruned_mse, mse_list[0]


#------------------ shared epilogue (end) -------------------
    else:
        min_mse = min(mse_list)
        # NOTE(review): this is the binomial-proportion SE formula applied to
        # an MSE; it is only meaningful if min_mse lies in [0, 1] -- sqrt
        # would receive a negative argument for min_mse > 1.  Confirm the MSE
        # here is normalized (targets are standardized upstream).
        SE = sqrt(min_mse * (1 - min_mse) / len(y_test))
        criterion_1_SE = min_mse + SE
        index_mse = 0
        for index, item in enumerate(mse_list):
            if mse_list[
                    len(mse_list) - 1 -
                    index] < criterion_1_SE:  # mse_list is not monotonic, so search from the end
                index_mse = len(mse_list) - 1 - index
                break

        pruned_mse = mse_list[index_mse]
        # Same epilogue as the 0-SE branch, with the 1-SE file names.
        best_alpha = alpha_list[index_mse]
        Best_tree = TreeSets[index_mse]
        dot_file = "./visualization/Best_tree_1SE.dot"
        svg_file = "./visualization/Best_tree_1SE.svg"
        # Prune a deep copy so the caller's model stays intact, then draw it.
        print "unpruned_mse=", mse_list[0]
        best_sklearn_model = copy.deepcopy(sklearn_model)
        prune_sklearn_model(best_sklearn_model.tree_, 0, Best_tree)

        draw_file(best_sklearn_model, dot_file, svg_file, feature_names)
        return Best_tree, best_alpha, pruned_mse, mse_list[0]
def model_json(data_path, name_path, cart_max_depth):
    """Train a CART regression tree and export it as JSON + SVG.

    Reads the dataset at data_path and the attribute names at name_path,
    standardizes both features and targets, fits a DecisionTreeRegressor of
    depth cart_max_depth, writes the extracted rule structure to
    structure.json, and draws the unpruned tree to ./visualization/T0.svg.

    Returns (dtr, result, X_train, y_train, X_test, y_test, feature_list).
    """
    ##########################################################
    feature_names = get_Attribute(name_path)
    print "data_path=", data_path
    #------------------------------------------
    x_list, y_list = read_data_for_split(data_path, n=0,
                                         label=1)  # split the data columns from the class-label column
    print "x_list=", x_list
    print "y_list=", y_list
    #------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(x_list,
                                                        y_list,
                                                        test_size=0.25,
                                                        random_state=0)
    print "X_train=", X_train
    # Separate scalers for the feature values and the target values.
    ss_X = StandardScaler()
    ss_y = StandardScaler()
    # The training data are all numeric, so standardize them.
    X_train = np.array(X_train)
    print "X_train=", X_train
    X_train = ss_X.fit_transform(np.array(X_train))
    X_test = np.array(X_test)
    X_test = ss_X.transform(np.array(X_test))

    y_train = np.array(y_train)
    y_test = np.array(y_test)
    # The targets (regression values, e.g. house prices) are numeric too, so
    # standardize them as well.
    # fit_transform/transform require 2D input while y_train/y_test are 1D,
    # hence reshape(-1, 1): e.g. [1, 2, 3] becomes [[1], [2], [3]].
    y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
    y_test = ss_y.transform(y_test.reshape(-1, 1))

    feature_list = get_Attribute(name_path)

    # NOTE(review): criterion='mse' was renamed to 'squared_error' in newer
    # scikit-learn releases -- confirm the pinned sklearn version accepts it.
    dtr = DecisionTreeRegressor(max_depth=cart_max_depth,
                                criterion='mse',
                                random_state=0)
    print "now training,wait please.........."
    dtr.fit(X_train, y_train)
    print "train finished"
    class_names = ''  # regression: no class names are needed
    result = rules(dtr, feature_list, class_names)
    print "result=", result
    with open('structure.json', 'w') as f:
        f.write(json.dumps(result))
    print "The json-style model has been stored in structure.json"

    print "now I'm drawing the CART Regression tree,wait please............"
    dot_file = "./visualization/T0.dot"
    png_file = "./visualization/T0.svg"
    draw_file(dtr, dot_file, png_file, feature_list)
    print "CART tree has been drawn in " + png_file
    return dtr, result, X_train, y_train, X_test, y_test, feature_list
def model_gtmin_Tt(clf,model,feature_names,class_names,Tt_name):#T0->T1
    """One CCP pruning step: cut the subtree with the minimal g(t) from model.

    Computes g(t) for every candidate prune point, prunes the subtree with
    the smallest g(t), mirrors the pruning onto a deep copy of the sklearn
    estimator clf, and draws it to ./visualization/T<Tt_name>.svg.
    class_names is accepted but unused in this body.

    NOTE(review): a second `def model_gtmin_Tt` later in this file rebinds
    this name with a 4-argument signature, shadowing this definition at
    import time -- rename one of the two.

    Returns (sklearn_model, T1, alpha) where T1 is the pruned json-style
    tree and alpha is the minimal g(t).
    """
    Tt=Tt_count(model,0)#|Tt|
    Rt=Rt_compute(model)# R(t)

    RTt=RTt_compute(model,0)# R(Tt)

    gt_list=[]
    prune_parts=[]
    gt_with_tree(model,gt_list,prune_parts)

    alpha=min(gt_list)
    prune_gt_index=gt_list.index(alpha)
    prune_for_minimum_gt=prune_parts[prune_gt_index]#
#------------------------------
    T0=copy.deepcopy(model)
    T1=copy.deepcopy(model)#here T1 means Ti
    gt_list=[]# must be reset to empty here
    prune_parts=[]# must be reset to empty here
    T1_create(T1,gt_list,prune_parts,prune_gt_index)
    # prune_for_minimum_gt above is not used because the cut subtree's
    # location is unknown: we know WHICH subtree to cut, but not under which
    # node it hangs, so prune_parts is rebuilt and T1_create derives T1 from
    # T0 (the original model) via prune_gt_index.

    index=0#never change this value!!!
    sklearn_model=copy.deepcopy(clf)
    prune_sklearn_model(sklearn_model.tree_,index,T1)
    dot_file="./visualization/T"+Tt_name+".dot"
    png_file="./visualization/T"+Tt_name+".svg"
    draw_file(sklearn_model,dot_file,png_file,feature_names)
    return sklearn_model,T1,alpha
def model_gtmin_Tt(dtr, model, feature_names, Tt_name):  # regression variant; original author's note: not yet finished
    """One CCP pruning step for the regression tree (work in progress).

    Computes g(t) for every candidate prune point, prunes the subtree with
    the smallest g(t) from a copy of the json-style tree, mirrors the pruning
    onto a deep copy of dtr, and draws the result to
    ./visualization/T<Tt_name>.svg.

    NOTE(review): this rebinds the name of the earlier 5-argument
    `model_gtmin_Tt` in this file; at import time only this 4-argument
    version survives -- rename one of the two.

    Returns (sklearn_model, T1, alpha).
    """
    Tt = Tt_count(model, 0)  #|Tt|
    Rt = Rt_compute(model)  # R(t)

    RTt = RTt_compute(model, 0)  # R(Tt)

    gt_list = []
    prune_parts = []
    gt_with_tree(model, gt_list, prune_parts)

    print "gt_list=", gt_list
    print "prune_parts=", prune_parts
    print "len(prune_parts)=", len(prune_parts)
    print "model=", model
    alpha = min(gt_list)
    prune_gt_index = gt_list.index(alpha)
    prune_for_minimum_gt = prune_parts[prune_gt_index]
    #------------------------------
    T0 = copy.deepcopy(model)
    T1 = copy.deepcopy(model)  #here T1 means Ti
    gt_list = []  # must be reset to empty here
    pruned_parts = []  # must be reset to empty here
    T1_create(T1, gt_list, pruned_parts, prune_gt_index)
    print "pruned_parts=", pruned_parts

    index = 0  #never change this value!!!
    sklearn_model = copy.deepcopy(dtr)
    prune_sklearn_model(sklearn_model.tree_, index, T1)
    dot_file = "./visualization/T" + Tt_name + ".dot"
    png_file = "./visualization/T" + Tt_name + ".svg"
    draw_file(sklearn_model, dot_file, png_file, feature_names)
    return sklearn_model, T1, alpha