def train_predict_knn(k_list, X_train_norm, X_test_norm, train_scores,
                      test_scores):
    # initialize base detectors
    clf_list = []
    for k in k_list:
        clf = Knn(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)
        train_score = clf.decision_scores
        test_score = clf.decision_function(X_test_norm)
        clf_name = 'knn_' + str(k)
        clf_list.append(clf_name)
        curr_ind = len(clf_list) - 1

        train_scores[:, curr_ind] = train_score.ravel()
        test_scores[:, curr_ind] = test_score.ravel()

    return train_scores, test_scores
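A minimal usage sketch for the helper above, assuming X_train_norm and X_test_norm already exist; the three-detector k_list and the preallocated score matrices here are illustrative, not part of the original script:

# illustrative driver for train_predict_knn (hypothetical values;
# the full script below uses 20 detectors)
k_list = [10, 20, 30]
n_clf = len(k_list)
train_scores = np.zeros([X_train_norm.shape[0], n_clf])
test_scores = np.zeros([X_test_norm.shape[0], n_clf])
train_scores, test_scores = train_predict_knn(
    k_list, X_train_norm, X_test_norm, train_scores, test_scores)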
# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)

# initialize 20 base detectors for combination
k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
          110, 120, 130, 140, 150, 160, 170, 180, 190, 200]

train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

for i in range(n_clf):
    k = k_list[i]

    clf = Knn(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores.ravel()
    test_scores[:, i] = clf.decision_function(X_test_norm).ravel()

# scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                   test_scores)

# combination by mean
comb_by_mean = np.mean(test_scores_norm, axis=1)
roc_mean.append(roc_auc_score(y_test, comb_by_mean))
prn_mean.append(precision_n_scores(y_test, comb_by_mean))
print('ite', t + 1, 'comb by mean,',
      'ROC:', roc_auc_score(y_test, comb_by_mean),
      'precision@n:', precision_n_scores(y_test, comb_by_mean))
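Averaging is only one way to pool the normalized scores; taking the element-wise maximum across detectors is another common combination. A minimal sketch using the same variables as above (the roc_max and prn_max accumulators are hypothetical and would be initialized alongside roc_mean and prn_mean):

# combination by max: an outlier flagged strongly by any one detector
# keeps a high combined score (roc_max/prn_max are illustrative names)
comb_by_max = np.max(test_scores_norm, axis=1)
roc_max.append(roc_auc_score(y_test, comb_by_max))
prn_max.append(precision_n_scores(y_test, comb_by_max))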
from sklearn.metrics import roc_auc_score

from data.load_data import generate_data
from models.knn import Knn
from utility.utility import precision_n_scores

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 1000
    n_test = 500

    X_train, y_train, c_train, X_test, y_test, c_test = generate_data(
        n=n_train, contamination=contamination, n_test=n_test)

    # train a kNN detector (default version)
    clf = Knn(n_neighbors=10, contamination=contamination, method='largest')
    clf.fit(X_train)

    # get the prediction on the training data
    y_train_pred = clf.y_pred
    y_train_score = clf.decision_scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.decision_function(X_test)

    print('Train ROC:{roc}, precision@n:{prn}'.format(
        roc=roc_auc_score(y_train, y_train_score),
        prn=precision_n_scores(y_train, y_train_score)))
    print('Test ROC:{roc}, precision@n:{prn}'.format(
        roc=roc_auc_score(y_test, y_test_score),
        prn=precision_n_scores(y_test, y_test_score)))
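The contamination parameter is what turns the raw scores into the binary labels above. As a standalone sketch of the usual convention (this is an assumption about the Knn class's internals, which the snippet does not show), the decision threshold is the (1 - contamination) quantile of the training scores:

# hedged sketch: recover the binary labels by thresholding the scores
# at the (1 - contamination) training quantile (assumed convention,
# not taken from the Knn class itself)
import numpy as np

threshold = np.percentile(y_train_score, 100 * (1 - contamination))
y_test_pred_manual = (y_test_score > threshold).astype(int)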
# standardizing data for processing
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
          110, 120, 130, 140, 150, 160, 170, 180, 190, 200]

train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

for i in range(n_clf):
    k = k_list[i]

    clf = Knn(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores.ravel()
    test_scores[:, i] = clf.decision_function(X_test_norm).ravel()
    # print(k, roc_auc_score(y_test, test_scores[:, i]))

# scores have to be normalized before combination
scaler = StandardScaler().fit(train_scores)
train_scores_norm = scaler.transform(train_scores)
test_scores_norm = scaler.transform(test_scores)

mean_result = np.mean(test_scores_norm, axis=1)
roc_mean.append(roc_auc_score(y_test, mean_result))
print('ite', t, 'mean', roc_auc_score(y_test, mean_result))
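The commented-out print inside the loop hints at checking each base detector on its own; expanded into a small block, it makes an easy comparison against the combined score:

# per-detector test ROC, for comparison with the mean combination
for i in range(n_clf):
    print('k =', k_list[i],
          'ROC:', roc_auc_score(y_test, test_scores[:, i]))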
# print(textds.embedding_from_text(['I love coffee']).shape)

# %%
knn_ds = {
    'hamming': ((X_train > 0).astype('float'),
                (X_valid > 0).astype('float')),
    'euclidean': (X_train, X_valid),
    'cosine': (tf_idf(X_train, alpha=1e-6, beta=1e-9),
               tf_idf(X_valid, alpha=1e-6, beta=1e-9)),
}

best_acc = 0
best_metric = None
best_k = 0
for metric, (X_train_, X_valid_) in knn_ds.items():
    for k in [1, 3, 5]:
        clf = TextClassifier(Knn(n_neighbors=k, metric=metric))
        clf.fit(X_train_, y_train)
        acc = clf.score(X_valid_, y_valid)
        print(metric, k, round(acc * 100, 2), sep=', ')
        if acc > best_acc:
            best_acc = acc
            best_metric = metric
            best_k = k

print(best_metric, best_k, round(best_acc * 100, 2), sep=', ')

# %%
from sklearn.neighbors import KNeighborsClassifier

clf = TextClassifier(KNeighborsClassifier(n_neighbors=5, metric='cosine'))
clf.fit(tf_idf(X_train, alpha=1e-6, beta=1e-9), y_train)
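The sklearn baseline presumably finishes the way the hand-rolled search does; a short sketch of the matching evaluation, assuming the same TextClassifier.score interface used above:

# evaluate the sklearn baseline on the same tf-idf validation features
acc = clf.score(tf_idf(X_valid, alpha=1e-6, beta=1e-9), y_valid)
print('sklearn cosine kNN, k=5:', round(acc * 100, 2))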