Example #1
def search_param(xTrain, yTrain, xTest, yTest):
    perf = pd.DataFrame()
    # do a grid search using OOB and set max trees to 250
    for crit in ['gini', 'entropy']:
        for mf in range(5,12):
            for md in range(1,6):
                for mls in range(5,8):
                    print(crit, mf, md, mls)
                    rf = RandomForest(250, mf,
                                      crit, md,
                                      mls)
                    oobErr = rf.train(xTrain, yTrain)
                    # use oob to keep track of stuff
                    tmpDF = pd.DataFrame.from_dict(oobErr,
                                                   orient='index',
                                                   columns=['err'])
                    tmpDF['nest'] = tmpDF.index
                    tmpDF['crit'] = crit
                    tmpDF['mf'] = mf
                    tmpDF['md'] = md
                    tmpDF['mls'] = mls
                    perf = pd.concat([perf, tmpDF])
    # clean up the indexing for the pandas dataframe
    perf = perf.reset_index(drop=True)
    return perf
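search_param returns one row per (nest, crit, mf, md, mls) combination together with its OOB error, so the best setting can be read straight off the DataFrame. A minimal sketch, assuming the column names created above:

best_perf = search_param(xTrain, yTrain, xTest, yTest)
best_row = best_perf.loc[best_perf['err'].idxmin()]  # row with the lowest OOB error
print(best_row[['nest', 'crit', 'mf', 'md', 'mls', 'err']])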
Example #2
def main():
    
    # set path
    iris_path = 'DataAnalysisProjectDesign/Experiment2/iris_train.arff'
    adult_path = 'DataAnalysisProjectDesign/Experiment2/adult_train.arff'

    # get choice
    data_choice = input('Enter 1 for iris; Enter 2 for adult:')

    dt_num = int(input('Enter your expected tree number:'))

    path = select_dataset(data_choice,iris_path,adult_path)

    # create data instance
    data_obj = Data(path)
    data_obj.load_data()
    data_obj.fill_missing_data()

    # create random forest
    rf = RandomForest(
        data=data_obj,
        dt_num=dt_num
    )

    rf.bagging()
    rf.train_rf()
    correct_rate,conf_mat = rf.test_rf()

    return dt_num,correct_rate,conf_mat
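select_dataset is not shown in this listing; a minimal sketch consistent with the prompt in main() (choice '1' selects the iris path, '2' the adult path) could be:

def select_dataset(data_choice, iris_path, adult_path):
    # map the user's menu choice to the matching dataset path
    if data_choice == '1':
        return iris_path
    if data_choice == '2':
        return adult_path
    raise ValueError('Enter 1 for iris or 2 for adult')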
Example #3
File: q2.py Project: simonmarty/emory
import matplotlib.pyplot as plt
from rf import RandomForest, file_to_numpy

xTrain = file_to_numpy("q4xTrain.csv")
yTrain = file_to_numpy("q4yTrain.csv")
xTest = file_to_numpy("q4xTest.csv")
yTest = file_to_numpy("q4yTest.csv")

nests = range(1, 100)
results = []
for nest in nests:
    rf = RandomForest(nest=nest)
    res = rf.train(xTrain, yTrain)
    results.append(res[-1])

plt.xlabel("number of trees used")
plt.ylabel("OOB Error")
plt.plot(nests, results)
plt.savefig("nestcount")
plt.close()

rf = RandomForest(nest=300)
res = rf.train(xTrain, yTrain)

plt.plot(range(300), res)
plt.savefig("progress")
plt.close()

maxFeats = range(1, xTrain.shape[1])
results = []
for featset_length in maxFeats:
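    # The loop body is cut off in this listing. A plausible completion, assuming the
    # RandomForest from rf.py accepts a maxFeat keyword (hypothetical name, mirroring
    # the second positional argument used in Example #1):
    rf = RandomForest(nest=50, maxFeat=featset_length)
    res = rf.train(xTrain, yTrain)
    results.append(res[-1])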
Example #4
def ensemble_diff_plot():
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # evaluation metric: accuracy (higher is better)
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )

        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                # n_trees=n_trees,
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit 'em
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            entries = [("RF", loss_mine_test, y_pred_mine_test),
                       ("DT", loss_mine_test_d, y_pred_mine_test_d),
                       ("GB", loss_mine_test_g, y_pred_mine_test_g)]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    #  s=0.5,
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            #  s=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                #  linewidth=0.5,
                label="GB".format(n_trees, n_feats, max_depth_d),
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                #  linewidth=0.5,
                label="RF".format(n_trees, n_feats, max_depth_r),
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                #  linewidth=0.5,
                label="DT".format(max_depth_d),
                color="yellowgreen",
            )
            ax.set_title("GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format(
                loss_mine_test_g, loss_mine_test, loss_mine_test_d))
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
Example #5
def test_RandomForest():
    np.random.seed(12345)
    i = 1
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        n_trees = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # evaluation metric: classification error (1 - accuracy)
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth,
            )
            gold = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth,
                classifier=classifier,
            )
            gold = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))

        print("PASSED")
        i += 1
Example #6
File: tests.py Project: yaohusama/ai
def test_RandomForest():
    np.random.seed(12345)
    i = 1
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = 50  # np.random.randint(2, 100)
        n_trees = 20  # np.random.randint(2, 100)
        max_depth = 3  # np.random.randint(1, 5)

        classifier = True  # np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # evaluation metric: classification error (1 - accuracy)
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            # initialize model
            criterion = "gini"#np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth,
            )
            # n_estimators: the number of trees to build before taking the majority vote or
            # averaging the predictions. More trees usually give better, more stable results
            # but slow the code down; use as high a value as your processor can handle.
            # max_features: the maximum number of features each individual tree may use.
            # scikit-learn offers several options for this parameter, for example:
                # Auto/None: simply use all features for every tree, with no restriction.
                # sqrt: each tree may use the square root of the total number of features,
                #   e.g. 10 out of 100 features. "log2" is a similar option.
                # 0.2: each tree may use 20% of the features; the "0.X" format selects x% of them.
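            # Illustrative only (hypothetical values, not used in this test); all of these
            # max_features settings are accepted by sklearn's RandomForestClassifier:
            #   RandomForestClassifier(n_estimators=100, max_features="sqrt")
            #   RandomForestClassifier(n_estimators=100, max_features=0.2)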
            gold = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth,
                classifier=classifier,
            )
            gold = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)
        print(f"train data mine acc: {1-loss_mine}")
        print(f"train data sklearn acc: {1-loss_gold}")
        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)
        print(f"test data mine acc: {1-loss_mine_test}")
        print(f"test data sklearn acc: {1-loss_gold_test}")
        #try:
            #np.testing.assert_almost_equal(loss_mine, loss_gold)
            #print("\tLoss on training: {}".format(loss_mine))
        #except AssertionError as e:
            #print("\tTraining losses not equal:\n{}".format(e))

        #try:
            #np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            #print("\tLoss on test: {}".format(loss_mine_test))
        #except AssertionError as e:
            #print("\tTest losses not equal:\n{}".format(e))

        print("PASSED")
        i += 1
Example #7
def eval_opt(xTrain, yTrain, xTest, yTest):
    bst = RandomForest(47, 7, 'gini', 4, 5)
    # fit the forest on the training data before predicting (train(), as in Example #1)
    bst.train(xTrain, yTrain)
    ypred = bst.predict(xTest)
    # evaluate predictions: report the test error rate
    print(1 - skm.accuracy_score(yTest, ypred))
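A minimal usage sketch (assuming eval_opt lives alongside the rf module from Example #3, so file_to_numpy and the q4 CSV files are available, and that sklearn.metrics is imported as skm):

from rf import file_to_numpy

xTrain = file_to_numpy("q4xTrain.csv")
yTrain = file_to_numpy("q4yTrain.csv")
xTest = file_to_numpy("q4xTest.csv")
yTest = file_to_numpy("q4yTest.csv")
eval_opt(xTrain, yTrain, xTest, yTest)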