예제 #1
0
def train_predict_knn(k_list, X_train_norm, X_test_norm, train_scores,
                      test_scores):
    """Fit one kNN outlier detector per value in ``k_list`` and record scores.

    Parameters
    ----------
    k_list : list of int
        ``n_neighbors`` values, one per base detector.
    X_train_norm, X_test_norm : array-like
        Standardized training / test feature matrices.
    train_scores, test_scores : ndarray, shape (n_samples, len(k_list))
        Pre-allocated score matrices, filled in place: column ``i`` receives
        the scores of the detector built with ``k_list[i]``.

    Returns
    -------
    tuple
        ``(train_scores, test_scores)`` — the same arrays, now filled.
    """
    # enumerate() yields the destination column directly; the original
    # tracked it indirectly through the length of an auxiliary name list
    # (clf_list / clf_name), which was otherwise unused.
    for i, k in enumerate(k_list):
        clf = Knn(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores.ravel()
        test_scores[:, i] = clf.decision_function(X_test_norm).ravel()

    return train_scores, test_scores
예제 #2
0
        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # initialize 20 base detectors for combination
        # (n_clf is presumably len(k_list) == 20 — it is defined outside
        # this view; TODO confirm)
        k_list = [
            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
            160, 170, 180, 190, 200
        ]

        # one column of outlier scores per base detector
        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        for i in range(n_clf):
            k = k_list[i]

            # 'largest' presumably scores by distance to the k-th nearest
            # neighbor — confirm against the Knn implementation
            clf = Knn(n_neighbors=k, method='largest')
            clf.fit(X_train_norm)

            train_scores[:, i] = clf.decision_scores.ravel()
            test_scores[:, i] = clf.decision_function(X_test_norm).ravel()

        # scores have to be normalized before combination
        train_scores_norm, test_scores_norm = standardizer(
            train_scores, test_scores)

        # combination by mean
        comb_by_mean = np.mean(test_scores_norm, axis=1)
        roc_mean.append(roc_auc_score(y_test, comb_by_mean))
        prn_mean.append(precision_n_scores(y_test, comb_by_mean))
        # t is the outer trial index; the print statement continues past
        # this chunk
        print('ite', t + 1, 'comb by mean,', 'ROC:',
              roc_auc_score(y_test, comb_by_mean), 'precision@n:',
예제 #3
0
from sklearn.metrics import roc_auc_score

from data.load_data import generate_data
from models.knn import Knn
from utility.utility import precision_n_scores

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 1000  # training set size
    n_test = 500  # test set size

    # synthetic data with the given fraction of outliers; c_train / c_test
    # are unused here — presumably auxiliary outputs of generate_data;
    # TODO confirm against data.load_data
    X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
        n=n_train, contamination=contamination, n_test=n_test)

    # train a kNN detector (default version)
    clf = Knn(n_neighbors=10, contamination=contamination, method='largest')
    clf.fit(X_train)

    # get the prediction on the training data
    y_train_pred = clf.y_pred  # binary inlier/outlier labels
    y_train_score = clf.decision_scores  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.decision_function(X_test)

    # evaluation: ROC-AUC and precision@n on the raw scores
    print('Train ROC:{roc}, precision@n:{prn}'.format(
        roc=roc_auc_score(y_train, y_train_score),
        prn=precision_n_scores(y_train, y_train_score)))

    # statement continues past this chunk
    print('Test ROC:{roc}, precision@n:{prn}'.format(
예제 #4
0
        # standardizing data for processing
        scaler = StandardScaler().fit(X_train)
        X_train_norm = scaler.transform(X_train)
        X_test_norm = scaler.transform(X_test)

        # n_neighbors values for the base detectors (n_clf is presumably
        # len(k_list) == 20 — defined outside this view; TODO confirm)
        k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
                  150, 160, 170, 180, 190, 200]

        # one column of outlier scores per base detector
        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        for i in range(n_clf):
            k = k_list[i]

            clf = Knn(n_neighbors=k, method='largest')
            clf.fit(X_train_norm)

            # raw train scores and test decision scores for detector i
            train_scores[:, i] = clf.decision_scores.ravel()
            test_scores[:, i] = clf.decision_function(X_test_norm).ravel()
            # print(k, roc_auc_score(y_test, test_scores[:, i]))

        # scores have to be normalized before combination: the scaler is
        # fit on the train scores and applied to both score matrices
        scaler = StandardScaler().fit(train_scores)
        train_scores_norm = scaler.transform(train_scores)
        test_scores_norm = scaler.transform(test_scores)

        # combine detectors by averaging normalized test scores, then
        # evaluate against ground truth (t is the outer trial index)
        mean_result = np.mean(test_scores_norm, axis=1)
        roc_mean.append(roc_auc_score(y_test, mean_result))
        print('ite', t, 'mean', roc_auc_score(y_test, mean_result))
예제 #5
0
# print(textds.embedding_from_text(['I love coffee']).shape)

# %%
# Pair each candidate distance metric with the train/validation
# representation it should be evaluated on.
knn_ds = {
    'hamming': (
        (X_train > 0).astype('float'),
        (X_valid > 0).astype('float'),
    ),
    'euclidean': (X_train, X_valid),
    'cosine': (
        tf_idf(X_train, alpha=1e-6, beta=1e-9),
        tf_idf(X_valid, alpha=1e-6, beta=1e-9),
    ),
}

# Grid-search over (metric, k): keep the first combination that reaches the
# highest validation accuracy.
best_acc = 0
best_metric = None
best_k = 0
for dist_name, (X_tr, X_va) in knn_ds.items():
    for n_nb in [1, 3, 5]:
        model = TextClassifier(Knn(n_neighbors=n_nb, metric=dist_name))
        model.fit(X_tr, y_train)
        acc = model.score(X_va, y_valid)
        print(dist_name, n_nb, round(acc * 100, 2), sep=', ')

        if acc > best_acc:
            best_acc, best_metric, best_k = acc, dist_name, n_nb
print(best_metric, best_k, round(best_acc * 100, 2), sep=', ')

# %%
# Baseline: scikit-learn's kNN with cosine distance on the tf-idf
# representation, wrapped in the same TextClassifier interface.
from sklearn.neighbors import KNeighborsClassifier

clf = TextClassifier(KNeighborsClassifier(n_neighbors=5, metric='cosine'))
clf.fit(tf_idf(X_train, alpha=1e-6, beta=1e-9), y_train)