def run_knn(df_knn):
    """Choose the number of neighbours for a KNN classifier by cross-validation.

    Every odd ``k`` from 1 to 49 is scored with ``cross_val_score`` (``cv=10``,
    accuracy) on the training split.  The ``k`` with the lowest
    misclassification error (``1 - mean accuracy``) is printed, and the error
    curve is handed to ``plot_data``.

    Args:
        df_knn: pandas DataFrame holding the feature columns plus a
            ``"class"`` column that is used as the target.

    Returns:
        None.  The result is reported via ``print`` and ``plot_data``.
    """
    print(
        "\n\n----------------------K Nearest Neighbors----------------------\n\n"
    )

    # ``DataFrame.ix`` was removed in pandas 1.0.  Dropping "class" explicitly
    # also keeps the target out of the feature matrix (the old ``.ix[:, 0:]``
    # selected every column, label included).
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))

    # Only the training split is needed for cross-validation.
    x_train, _x_test, y_train, _y_test = split_train_test(x, y)

    neighbors = list(range(1, 50, 2))
    cv_scores = []
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn,
                                 x_train,
                                 y_train,
                                 cv=10,
                                 scoring='accuracy')
        cv_scores.append(scores.mean())

    # Misclassification error per candidate k; the first minimum wins on ties.
    errors = [1 - score for score in cv_scores]
    optimal_k = neighbors[errors.index(min(errors))]
    print("The optimal number of neighbors is %d" % optimal_k)

    plot_data(neighbors, errors)
def run_random_forest(df_knn):
    """Train a random forest on ``df_knn`` and print its test-set accuracy.

    Args:
        df_knn: pandas DataFrame holding the feature columns plus a
            ``"class"`` column that is used as the target.

    Returns:
        None.  The accuracy is reported via ``print``.
    """
    print("\n\n----------------------Random Forest----------------------\n\n")

    # ``DataFrame.ix`` was removed in pandas 1.0.  Dropping "class" explicitly
    # also keeps the target out of the feature matrix (the old ``.ix[:, 0:]``
    # selected every column, label included).
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))

    x_train, x_test, y_train, y_test = split_train_test(x, y)

    rf = RandomForestClassifier(random_state=1,
                                n_estimators=250,
                                min_samples_split=8,
                                min_samples_leaf=4)
    rf.fit(x_train, y_train)
    pred = rf.predict(x_test)
    print("Accuracy: ", accuracy_score(y_test, pred))
def run_random_forest(df_knn):
    """Fit a random forest on ``df_knn`` and print several accuracy figures.

    The ``"class"`` column is the target; all remaining columns are features.
    After fitting, the function prints the accuracy on training rows of
    selected classes, followed by the overall training and testing accuracy.

    Args:
        df_knn: pandas DataFrame with a ``"class"`` column.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("\n\n----------------------Random Forest----------------------\n\n")

    labels = np.array(df_knn["class"])
    features = np.array(df_knn.drop(columns="class"))
    x_train, x_test, y_train, y_test = split_train_test(features, labels)

    forest = RandomForestClassifier(
        random_state=1,
        n_estimators=250,
        min_samples_split=8,
        min_samples_leaf=4,
    )
    forest.fit(x_train, y_train)
    test_predictions = forest.predict(x_test)
    train_predictions = forest.predict(x_train)

    # (banner, class label, whether to also print the selected row indices)
    # NOTE(review): "Pred good" selects label 1, the same as "Pred Failure" --
    # looks like a copy-paste leftover; confirm the intended label.
    per_class_reports = (
        ("\n\n----------------------Pred success----------------------\n\n",
         2, True),
        ("\n\n----------------------Pred Failure----------------------\n\n",
         1, False),
        ("\n\n----------------------Pred good----------------------\n\n",
         1, False),
    )
    for banner, label, show_indices in per_class_reports:
        # These per-class figures are computed on the training split.
        rows = np.where(y_train == label)[0]
        if show_indices:
            print("location index:", rows)
        class_predictions = forest.predict(x_train[rows])
        print(banner, accuracy_score(y_train[rows], class_predictions))

    # Overall accuracy on each split.
    print(
        "\n\n----------------------Training Set Accuracy----------------------\n\n",
        accuracy_score(y_train, train_predictions),
    )
    print(
        "\n\n----------------------Testing Set Accuracy----------------------\n\n",
        accuracy_score(y_test, test_predictions),
    )

    return forest
# Example #4
# 0
def run_logistic_regression(df_knn):
    """Fit a logistic regression on ``df_knn`` and print accuracy figures.

    The ``"class"`` column is the target; all remaining columns are features.
    Overall training and testing accuracy are printed as percentages, followed
    by the accuracy (as a fraction) on training rows of selected classes.

    Args:
        df_knn: pandas DataFrame with a ``"class"`` column.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    y = np.array(df_knn["class"])
    x = np.array(df_knn.drop(columns="class"))

    x_train, x_test, y_train, y_test = split_train_test(x, y)

    logistic = LogisticRegression()
    logistic.fit(x_train, y_train)

    # ``score`` runs the prediction itself, so the separate ``predict`` calls
    # whose results were never read have been removed.
    print("Training accuracy: ", (logistic.score(x_train, y_train) * 100))
    print("Testing accuracy: ", (logistic.score(x_test, y_test) * 100))

    # (banner, class label) for each per-class report on the training split.
    # NOTE(review): "Pred good" selects label 1, the same as "Pred Failure" --
    # looks like a copy-paste leftover; confirm the intended label.
    per_class_reports = (
        ("\n\n----------------------Pred success----------------------\n\n", 2),
        ("\n\n----------------------Pred Failure----------------------\n\n", 1),
        ("\n\n----------------------Pred good----------------------\n\n", 1),
    )
    for banner, label in per_class_reports:
        rows = np.where(y_train == label)[0]
        print(banner, logistic.score(x_train[rows], y_train[rows]))

    return logistic
def run_xgboost_imdb(df_knn):
    """Train an XGBoost classifier on ``df_knn`` and print its accuracy.

    A fixed set of columns (selected by position) is removed from the feature
    matrix before fitting.  Training and testing accuracy are printed as
    percentages.

    Args:
        df_knn: pandas DataFrame with a ``"class"`` target column.  The
            positional drop list below assumes a specific column layout --
            presumably that of the IMDB dataset named in the banner; verify
            against the caller.

    Returns:
        None.  Results are reported via ``print``.
    """
    print(
        "\n\n----------------------XGBoost on IMDB dataset----------------------\n\n"
    )
    # ``DataFrame.ix`` was removed in pandas 1.0.  ``iloc`` keeps every column
    # in its original position, which the positional drop list depends on.
    x = np.array(df_knn.iloc[:, 0:])
    y = np.array(df_knn['class'])

    x_train, x_test, y_train, y_test = split_train_test(x, y)

    # Columns removed by position, plus the target column itself so the label
    # is never used as a feature (a no-op if it is already in the list).
    dropped_columns = {0, 1, 2, 3, 4, 5, 9, 44}
    dropped_columns.add(df_knn.columns.get_loc('class'))
    dropped_columns = sorted(dropped_columns)
    x_train = np.delete(x_train, dropped_columns, axis=1)
    x_test = np.delete(x_test, dropped_columns, axis=1)

    model = xgboost.XGBClassifier()
    model.fit(x_train, y_train)

    pred = model.predict(x_train)
    accuracy = accuracy_score(y_train, pred)
    print("Training accuracy: %.2f%%" % (accuracy * 100.0))

    pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    print("Testing accuracy: %.2f%%" % (accuracy * 100.0))
# !/usr/bin/python
# _*_ coding:utf-8 _*_
import numpy

# Call syntax instead of the Python 2 print statement: with a single argument
# it prints the same text on Python 2 and is valid on Python 3.
print(numpy.version.version)

from split_dataset import split_train_test

split_train_test('file1.txt')