예제 #1
0
def run_mlp_neural_network(train, test, ss_split, labels):
    """Train an MLP classifier and evaluate it on the held-out split.

    Features are standardized with statistics learned from the training
    split only. The same fitted scaler is then applied to the held-out
    split and to ``test`` so that every input the network sees lives in the
    feature space it was trained in.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Split specification, forwarded to ``hpr.prepData``.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)`` where ``test_predictions`` is
        the ``predict_proba`` output for ``test``, ``acc`` is the accuracy
        on the held-out split and ``ll`` is the log loss on the held-out
        split.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(train, test, ss_split,
                                                    labels)

    # Fit the scaler once, on the training split. Re-fitting it on the
    # held-out or test data would shift/scale those features differently
    # from what the model learned on.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print('ML Model: MLP Neural Network')
    # NOTE(review): per the scikit-learn docs, early_stopping and
    # validation_fraction only take effect for the 'sgd'/'adam' solvers and
    # learning_rate only for 'sgd'; with 'lbfgs' they are ignored. They are
    # kept to preserve the original configuration.
    model = MLPClassifier(hidden_layer_sizes=(150, ),
                          activation='logistic',
                          solver='lbfgs',
                          alpha=0.001,
                          max_iter=200,
                          early_stopping=True,
                          validation_fraction=0.2,
                          learning_rate='adaptive',
                          tol=1e-8,
                          random_state=1).fit(X_train_scaled, y_train)

    # Accuracy on the held-out split
    holdout_predictions = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, holdout_predictions)
    # Log loss on the held-out split
    holdout_probabilities = model.predict_proba(X_test_scaled)
    ll = log_loss(y_test, holdout_probabilities)

    # Class probabilities for the unlabelled data, scaled with the
    # training-split statistics.
    test_predictions = model.predict_proba(scaler.transform(test))
    return test_predictions, acc, ll
예제 #2
0
def run_k_nearest_neighbours(train, test, ss_split, labels):
    """Train a 3-nearest-neighbours classifier and evaluate it.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Split specification, forwarded to ``hpr.prepData``.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)`` where ``test_predictions`` is
        the ``predict_proba`` output for ``test``, ``acc`` is the accuracy
        on the held-out split and ``ll`` is the log loss on the held-out
        split.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(train, test, ss_split,
                                                    labels)

    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(X_train, y_train)
    print('ML Model: K-Nearest Neighbours')

    # Accuracy on the held-out split
    holdout_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, holdout_predictions)
    # Log loss on the held-out split
    holdout_probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, holdout_probabilities)

    test_predictions = clf.predict_proba(test)
    return test_predictions, acc, ll
예제 #3
0
def run_linear_discriminant_analysis(train, test, ss_split, labels):
    """Fit linear discriminant analysis and report held-out metrics.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Split specification, forwarded to ``hpr.prepData``.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)``: the ``predict_proba`` output
        for ``test``, then accuracy and log loss on the held-out split.
    """
    # Split the labelled data into a fitting part and a held-out part.
    splits = hpr.prepData(train, test, ss_split, labels)
    X_train, X_test, y_train, y_test = splits

    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    print('ML Model: Linear Discriminant Analysis')

    # Hard-label accuracy on the held-out part.
    holdout_labels = lda.predict(X_test)
    acc = accuracy_score(y_test, holdout_labels)

    # Probabilistic quality on the held-out part.
    holdout_probs = lda.predict_proba(X_test)
    ll = log_loss(y_test, holdout_probs)

    return lda.predict_proba(test), acc, ll
예제 #4
0
def run_random_forest(train, test, ss_split, labels):
    """Fit a default random forest and report held-out metrics.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Split specification, forwarded to ``hpr.prepData``.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)``: the ``predict_proba`` output
        for ``test``, then accuracy and log loss on the held-out split.
    """
    X_train, X_test, y_train, y_test = hpr.prepData(
        train, test, ss_split, labels)

    # The forest is built with library defaults (no seed is set here).
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    print('ML Model: Random Forest')

    # Score hard predictions first, then the predicted probabilities.
    predicted_classes = forest.predict(X_test)
    acc = accuracy_score(y_test, predicted_classes)
    predicted_probs = forest.predict_proba(X_test)
    ll = log_loss(y_test, predicted_probs)

    return forest.predict_proba(test), acc, ll
예제 #5
0
def run_support_vector_machine(train, test, ss_split, labels):
    """Grid-search an SVC over ``C`` and ``gamma`` and evaluate the winner.

    The grid search is run on the training split with ``ss_split`` as the
    cross-validation strategy. ``GridSearchCV`` refits the best estimator by
    default, so the fitted search object is used directly for prediction.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Forwarded to ``hpr.prepData`` and used as ``cv`` for the
            grid search.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)`` where ``test_predictions`` is
        the ``predict_proba`` output for ``test``, ``acc`` is the accuracy
        on the held-out split and ``ll`` is the log loss on the held-out
        split.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(train, test, ss_split,
                                                    labels)

    # Grid search over a logarithmic 6x6 grid of C and gamma.
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1, 10, 100]
    }

    # probability=True is required so that predict_proba is available.
    grid_search = GridSearchCV(SVC(probability=True),
                               param_grid=param_grid,
                               cv=ss_split)
    grid_search.fit(X_train, y_train)

    print('Best parameter: {}'.format(grid_search.best_params_))
    print('Best cross-validation accuracy score: {}'.format(
        grid_search.best_score_))
    print('\nBest estimator:\n{}'.format(grid_search.best_estimator_))

    print('ML Model: Suppoort Vector Machine')
    # Accuracy on the held-out split
    holdout_predictions = grid_search.predict(X_test)
    acc = accuracy_score(y_test, holdout_predictions)
    # Log loss on the held-out split
    holdout_probabilities = grid_search.predict_proba(X_test)
    ll = log_loss(y_test, holdout_probabilities)

    test_predictions = grid_search.predict_proba(test)
    return test_predictions, acc, ll
예제 #6
0
def run_logistic_regression(train, test, ss_split, labels):
    """Grid-search a multinomial logistic regression and evaluate it.

    Features are standardized with statistics learned from the training
    split only. The same fitted scaler is applied to the held-out split and
    to ``test`` so that all inputs share the feature space the model was
    fitted in. The grid search optimizes negative log loss using
    ``ss_split`` as the cross-validation strategy.

    Args:
        train: Training features, forwarded to ``hpr.prepData``.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Forwarded to ``hpr.prepData`` and used as ``cv`` for the
            grid search.
        labels: Targets for ``train``, forwarded to ``hpr.prepData``.

    Returns:
        Tuple ``(test_predictions, acc, ll)`` where ``test_predictions`` is
        the ``predict_proba`` output for ``test``, ``acc`` is the accuracy
        on the held-out split and ``ll`` is the log loss on the held-out
        split.
    """
    # prepare training and test data
    X_train, X_test, y_train, y_test = hpr.prepData(train, test, ss_split,
                                                    labels)

    # Fit the scaler once, on the training split. Re-fitting it on the
    # held-out or test data would scale those features inconsistently with
    # the data the model was fitted on.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {'C': [1000, 10000], 'tol': [0.000001, 0.00001]}
    log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
    # refit must be the boolean True (not the string 'True') so the best
    # estimator is refit on the whole training split after the search.
    grid_search = GridSearchCV(log_reg,
                               param_grid,
                               scoring='neg_log_loss',
                               refit=True,
                               n_jobs=1,
                               cv=ss_split)
    grid_search.fit(X_train_scaled, y_train)

    print('Best parameter: {}'.format(grid_search.best_params_))
    print('Best cross-validation neg_log_loss score: {}'.format(
        grid_search.best_score_))
    print('\nBest estimator:\n{}'.format(grid_search.best_estimator_))
    print('ML Model: Logistic Regression')
    # Accuracy on the held-out split
    holdout_predictions = grid_search.predict(X_test_scaled)
    acc = accuracy_score(y_test, holdout_predictions)
    # Log loss on the held-out split
    holdout_probabilities = grid_search.predict_proba(X_test_scaled)
    ll = log_loss(y_test, holdout_probabilities)

    # Class probabilities for the unlabelled data, scaled with the
    # training-split statistics.
    test_predictions = grid_search.predict_proba(scaler.transform(test))

    return test_predictions, acc, ll
예제 #7
0
def run_naive_bayes(train, test, ss_split, labels):
    """Fit Gaussian naive Bayes, print its CV score, and evaluate it.

    Args:
        train: Training features; forwarded to ``hpr.prepData`` and, via
            ``train.values``, to the cross-validation run.
        test: Unlabelled features to predict class probabilities for.
        ss_split: Forwarded to ``hpr.prepData`` and used as ``cv`` for the
            cross-validation run.
        labels: Targets for ``train``.

    Returns:
        Tuple ``(test_predictions, acc, ll)``: the ``predict_proba`` output
        for ``test``, then accuracy and log loss on the held-out split.
    """
    splits = hpr.prepData(train, test, ss_split, labels)
    X_train, X_test, y_train, y_test = splits

    # Model used for the held-out metrics and the final predictions.
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    print('ML Model: Naive Bayes')

    # Separate, fresh estimator cross-validated on the full labelled data.
    cv_scores = cross_val_score(GaussianNB(),
                                train.values,
                                labels,
                                cv=ss_split)
    mean_cv_score = np.mean(cv_scores)
    print('Mean Cross-validation scores: {}'.format(mean_cv_score))

    # Held-out accuracy, then held-out log loss.
    holdout_labels = nb_model.predict(X_test)
    acc = accuracy_score(y_test, holdout_labels)
    holdout_probs = nb_model.predict_proba(X_test)
    ll = log_loss(y_test, holdout_probs)

    return nb_model.predict_proba(test), acc, ll