Example #1
class TestKnnMedian(unittest.TestCase):

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def tearDown(self):
        pass
Example #2
class TestKnnMedian(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def tearDown(self):
        pass
Example #3
class TestKnnMedian(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #4
def knn(X_train, y_train=None, X_test=None, y_test=None):
    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores
    # # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    #
    # # evaluate and print the results
    # print("\nOn Training Data:")
    # evaluate_print(clf_name, y_train, y_train_scores)
    # print("\nOn Test Data:")
    # evaluate_print(clf_name, y_test, y_test_scores)
    #
    # visualize the results
    visualize(clf_name,
              X_train,
              X_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)

    return y_train_pred, y_train_scores
Example #5
File: views.py  Project: richuln6/MASK-D
def pyodtry():
    dfwhole = df_en_all
    df = dff2
    X1 = reduce(dfwhole)
    X2 = reduce(df)
    ddf = pd.read_pickle('LogFileDfs/original')

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.005
    clf = KNN(method='mean', contamination=outliers_fraction)
    xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

    clf.fit(X1)
    scores_pred = clf.decision_function(X2) * -1
    y_pred = clf.predict(X2)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)
    #dfx = pdf
    #dfx['outlier'] = y_pred.tolist()
    df['authenticated?'] = y_pred.tolist()
    ddf['authenticated?'] = df['authenticated?']
    output = ddf[ddf['authenticated?'] == 1]
    # create sqlalchemy engine
    #engine = create_engine("mysql+pymysql://{user}:{pw}@172.17.0.3/{db}".format(user="******",pw="richul123",db="emss"))
    # Insert whole DataFrame into  MySQL
    #output.to_sql('output', con = engine, if_exists = 'replace', chunksize = 1000)
    with pd.ExcelWriter(
            '/home/richul/Documents/EnhancingMailServerSecurity/Output/output.xlsx'
    ) as writer:
        output.to_excel(writer, sheet_name='output')
Example #6
class IForestSupervisedKNN(BaseDetector):
    def __init__(self, get_top=0.8, if_params={}, knn_params={}):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        self.iforest = IForest(**if_params)

        self.knn = KNN(**knn_params)

    def fit(self, X, y=None):

        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)

        scores = self.iforest.predict_proba(X)[:, 1]

        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]

        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        self.is_fitted = True

        return self

    def decision_function(self, X):

        check_is_fitted(self, ['is_fitted'])

        return self.knn.decision_function(X)
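Note: below is a minimal, hypothetical usage sketch for the composite detector above; it is not part of the original example. It assumes the imports the class already relies on (numpy, pyod's IForest and KNN, sklearn's check_array and check_is_fitted) are available, and the data and hyperparameters are illustrative only.

# Hypothetical usage sketch: fit the composite detector on synthetic data
# and read the standard pyod attributes it populates.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(200, 3), rng.uniform(-6, 6, size=(10, 3))])

det = IForestSupervisedKNN(get_top=0.8,
                           if_params={'n_estimators': 100},
                           knn_params={'n_neighbors': 5})
det.fit(X_demo)
print(det.labels_[:10])          # binary labels set by _process_decision_scores()
print(det.decision_scores_[:5])  # kNN-based outlier scores on the training data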
Example #7
def obj_func_kNN(params):
    # objective function used in Bayesian optimization
    outlier_fraction = params[0]
    n_neighbors = params[1]
    method = params[2]
    radius = params[3]

    # load data set to function work space
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # create model
    clf = KNN(contamination=outlier_fraction,
              n_neighbors=n_neighbors,
              method=method,
              radius=radius)
    # fit the dataset to the model
    clf.fit(X_train)

    scores_pred = clf.decision_function(
        X_train) * -1  # predict raw anomaly score
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)

    y_pred = clf.predict(
        X_train)  # prediction of a datapoint category outlier or inlier
    objVal = objVal_f(Rprecision, y_pred, Y_train)

    return objVal
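Because obj_func_kNN takes its parameters as a single list, it can be handed directly to a sequential Bayesian optimizer. The sketch below assumes scikit-optimize (skopt) is installed and that X_train.npy / Y_train.npy exist on disk; the search ranges are illustrative and not from the original source.

# Hypothetical driver: minimize obj_func_kNN with Gaussian-process based
# Bayesian optimization from scikit-optimize.
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

search_space = [
    Real(0.01, 0.3, name='contamination'),                       # params[0]
    Integer(3, 50, name='n_neighbors'),                          # params[1]
    Categorical(['largest', 'mean', 'median'], name='method'),   # params[2]
    Real(0.5, 2.0, name='radius'),                               # params[3]
]

result = gp_minimize(obj_func_kNN, search_space, n_calls=30, random_state=42)
print('best params:', result.x, 'best objective:', result.fun)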
Example #8
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # fit the detector clf on x_train

    # get the outlier labels and outlier scores of the training data x_train
    y_train_pred = clf.labels_  # binary labels on the training data (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores (the larger, the more abnormal)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the fitted clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # outlier labels on the test data (0: inliers, 1: outliers)
    y_test_scores = clf.decision_function(x_test)  # outlier scores (the larger, the more abnormal)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Example #9
  def detectarOutlierKNN(self, idmodelo, Xtodos, corteOutlier):
    # Outlier detection 1 --------------------------------------------------------------
    clf = KNN()
    clf.fit(Xtodos)

    # get outlier scores
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_test_scores = clf.decision_function(Xtodos)  # outlier scores

    YCodigoTodosComOutilier = self.selectMatrizY(idmodelo, "ID", "TODOS")

    cont = 0
    amostrasRemovidas = 0

    for itemOutilier in y_train_scores:
      if itemOutilier > corteOutlier:
        contTodos = 0
        for item in YCodigoTodosComOutilier:
          amostra = str(item)
          amostra = amostra.replace("[", "")
          amostra = amostra.replace("]", "")
          if contTodos == cont:
            db.execute(
              " update amostra set tpamostra = 'OUTLIER' where idamostra = " + str(amostra) + " and idmodelo = " + str(
                idmodelo) + "")
            print(itemOutilier)
            amostrasRemovidas = amostrasRemovidas + 1
            break
          contTodos = contTodos + 1
      cont = cont + 1

    session.commit()
    print("Numero de Amostras Removidas: " + str(amostrasRemovidas))
    return cont
Example #10
def api_alert(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd,
              influxdb_database, influxdb_table, apiid):

    timelimit = 'time > now()-1d'
    # connect to InfluxDB
    client = InfluxDBClient(influxdb_ip, influxdb_port, influxdb_user,
                            influxdb_pwd, influxdb_database)
    # query the past day of data for the current API
    result = client.query('select Average, CallCount, ErrorRate from ' +
                          influxdb_table + ' where ApiId = \'' + apiid +
                          '\' and ' + timelimit + ';')
    # convert the ResultSet into a list
    apis_table = list(result.get_points(measurement='apis'))
    # store the data to be processed in a DataFrame
    df = pd.DataFrame(data=apis_table)
    # drop columns not used in the computation; take the training set x
    x = df
    x = x.drop("time", axis=1)
    # normalize: min-max scale each feature to [0, 1]
    x['CallCount'] = (x['CallCount']-x['CallCount'].min()) / \
        (x['CallCount'].max()-x['CallCount'].min())
    x['Average'] = (x['Average']-x['Average'].min()) / \
        (x['Average'].max()-x['Average'].min())
    x['ErrorRate'] = x['ErrorRate'] / 100
    # take the most recent data point (last ten seconds) as the test point
    x_last = x.tail(1)
    #df_last = df.tail(1)
    x = x.drop(x.index[-1])
    df = df.drop(df.index[-1])
    # convert to a numpy array for training
    x = x.values

    # train a kNN detector
    clf_name = 'kNN'
    clf = KNN()  # initialize the detector clf
    clf.fit(x)  # fit the detector clf on the training data x

    # add a column to df with the outlier scores
    df['score'] = clf.decision_scores_

    # sort by score
    df = df.sort_values("score", ascending=False)
    #print(df.head(20))

    # predict on the new data point
    test_data = x_last
    test_scores = clf.decision_function(test_data)

    if (test_scores > 0.8):
        print('Anomaly level 4: must alert')
    elif (test_scores > 0.5):
        print('Anomaly level 3: should alert')
    elif (test_scores > 0.1):
        print('Anomaly level 2: alert recommended')
    elif (test_scores > 0.05):
        print('Anomaly level 1: alert optional')
        # These levels were derived from the plots in KNN.py: scores above 0.05 are clearly
        # abnormal, above 0.1 the point already shows outlier behavior, and above 0.5 it is
        # very far from the data.
        # The thresholds depend on the training window; for one day of data 0.05 works well.
    return test_scores
Example #11
def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    if model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # set scale_pos_weight to sum(negative instances) / sum(positive instances)
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    if model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    if model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)

    #save model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    #todo: Input data has to be 2-d for visualization.
    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #         y_test_pred, show_figure=True, save_figure=False)

    return model_file
Example #12
def detect_anomaly(df):
	clf = KNN()
	x_values = df.change.values.reshape(df.index.values.shape[0],1)
	y_values = df.change.values.reshape(df.change.values.shape[0],1)
	clf.fit(y_values)
	clf.predict(y_values)
	df["out_label"] = clf.predict(y_values)  #fit_predict_score
	df["out_score"] = clf.decision_function(y_values)
	return df
Example #13
def detect_anomaly(df):
	x_values = df.index.values.reshape(df.index.values.shape[0],1)
	y_values = df.change.values.reshape(df.change.values.shape[0],1)
	clf = KNN()
	clf.fit(y_values)
	clf.predict(y_values)
	df["label_knn"] = clf.predict(y_values)
	df["score_knn"] = clf.decision_function(y_values).round(4)
	return df
Example #14
def abnormal_KNN(train_npy, test_npy):
    clf_name = 'kNN'
    clf = KNN()
    train_npy = np.array(train_npy).reshape(-1, 1)
    clf.fit(train_npy)

    test_npy = np.array(test_npy).reshape(-1, 1)
    y_test_pred = clf.predict(test_npy)
    y_test_scores = clf.decision_function(test_npy)
    return y_test_pred
Example #15
class KNNPredictionHead:
    def __init__(self, n_neighbors=5, method="largest", **kwargs):

        self.model = KNN(n_neighbors=n_neighbors, method=method)

    def fit(self, X):

        return self.model.fit(X.detach().cpu().numpy())

    def decision_function(self, X):

        return self.model.decision_function(X.detach().cpu().numpy())
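A hypothetical usage sketch for the prediction head above (not from the original source); it assumes PyTorch is installed, and the embedding sizes are arbitrary.

# Score a batch of PyTorch embeddings with the kNN head defined above.
import torch

head = KNNPredictionHead(n_neighbors=5, method='largest')
train_embeddings = torch.randn(256, 64)   # e.g., encoder outputs for normal data
head.fit(train_embeddings)

query_embeddings = torch.randn(8, 64)
scores = head.decision_function(query_embeddings)  # higher score = more anomalous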
Example #16
    def get_KNN_scores(dataframe,
                       cols,
                       outliers_fraction=0.01,
                       standardize=True):
        '''Takes df, a list of selected column names, and outliers_fraction (0.01 by default).

        Returns:
            df with KNN scores added
        '''
        if standardize:
            #standardize selected variables
            minmax = MinMaxScaler(feature_range=(0, 1))
            dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Convert the dataframe to a numpy array so it can be fed to the algorithm
        arrays = []
        for row in cols:
            row = dataframe[row].values.reshape(-1, 1)
            arrays.append(row)
        X = np.concatenate((arrays), axis=1)

        #fit
        clf = KNN(contamination=outliers_fraction)
        clf.fit(X)

        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        CheckOutliers.df4 = dataframe
        CheckOutliers.df4['outlier'] = y_pred.tolist()

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with KNN')
# In[169]:


# train a kNN detector
clf_name = 'kNN'
clf = KNN()  # initialize the detector
clf.fit(new_origin_all[:pos])  # fit the detector clf on the training portion

# get the outlier labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels on the training data (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores (the larger, the more abnormal)

# use the fitted clf to predict outliers in unseen data
y_test_pred = clf.predict(new_origin_all[pos:])  # outlier labels on the test portion (0: inliers, 1: outliers)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores on the test portion

show_scatter(clf_name, df, y_train_pred, pos)


# In[170]:


clf_name = 'COF'
clf = COF(n_neighbors=30)
clf.fit(new_origin_all[:pos])

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
clf.fit(trainData)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
#print(y_train_pred)
y_train_scores = clf.decision_scores_  # raw outlier scores
#print(y_train_scores)

# get the prediction on the test data
y_test_pred = clf.predict(testData)  #X_test)  # outlier labels (0 or 1)
print(y_test_pred)

from sklearn.metrics import accuracy_score

accuracy_percentage = accuracy_score(testTarget, y_test_pred) * 100
print("The prediction accuracy is:", end=" ")
print(accuracy_percentage)

y_test_scores = clf.decision_function(testData)  #X_test)  # outlier scores
#print(y_test_scores)

# evaluate and print the results
#print("\nOn Training Data:")
#evaluate_print(clf_name, y_train, y_train_scores)
#print("\nOn Test Data:")
#evaluate_print(clf_name, y_test, y_test_scores)

# visualize(clf_name, trainData, trainTarget, testData, testTarget, y_train_pred,
#          y_test_pred, show_figure=True, save_figure=False)
Example #19
df['score'] = clf.decision_scores_
#df['pred'] = clf.labels_
df = df.sort_values("score", ascending=False)
print(df.head(10))
print(df['CallCount'].max())
max_score = df.head(1)['score'].values
print(max_score)
df['prob'] = df['score'] / max_score
print(df.head(10))
'''
# use the fitted clf to predict outliers in unseen data
y_test_pred = clf.predict(X_test)  # outlier labels on the test data (0: inliers, 1: outliers)
y_test_scores = clf.decision_function(X_test)  # outlier scores (the larger, the more abnormal)'''

# predict on new data
testdata = {"Average": 150, "CallCount": 130, 'ErrorRate': 100}
test = pd.DataFrame(data=testdata, index=[0])
print(test)
test_scores = clf.decision_function(test)
print(test_scores)

alert = pd.DataFrame(df.loc[df['score'] > 10])
# plot the data
plt.title(timelimit)
plt.xlabel('Average')
plt.ylabel('CallCount')
plt.scatter('Average', 'CallCount', s=1, data=df)
plt.scatter('Average', 'CallCount', s=2, c='red', data=alert)
#plt.scatter('Average', 'CallCount', s=3, c='green', data=test)
plt.show()
Example #20
                y_item = np.array(y_item)
                contam /= len(y_item)
                contam = min(0.5, contam)
                # define the models
                classifiers = {
                    'KNN': KNN(contamination=contam),
                    'LOF': LOF(contamination=contam),
                    'PCA': PCA(contamination=contam),
                    'LODA': LODA(contamination=contam)
                }
                for cls in classifiers:
                    clf = classifiers[cls]
                    t0 = time.time()
                    x_item = standardizer(x_item)
                    clf.fit(x_item)
                    y_scores = clf.decision_function(x_item)
                    t1 = time.time()
                    duration = round(t1 - t0, ndigits=4)

                    roc = round(roc_auc_score(y_item, y_scores), ndigits=4)
                    prn = round(precision_n_scores(y_item, y_scores),
                                ndigits=4)
                    results[cls].append(roc)

                    print(
                        'benchmark id:{bench_id}, model:{clf_name}, ROC:{roc}, precision @ rank n:{prn}, '
                        'execution time: {duration}s'.format(
                            bench_id=bench_id,
                            clf_name=cls,
                            roc=roc,
                            prn=prn,
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

        clf_name = 'KNN'
        clf = KNN()  # initialize the detector clf
        clf.fit(X_train)  # fit the detector clf on X_train

        # get the outlier labels and outlier scores of the training data X_train
        y_train_pred = clf.labels_  # binary labels on the training data (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores (the larger, the more abnormal)
        print("On train Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        # use the fitted clf to predict outliers in unseen data
        y_test_pred = clf.predict(X_test)  # outlier labels on the test data (0: inliers, 1: outliers)
        y_test_scores = clf.decision_function(X_test)  # outlier scores (the larger, the more abnormal)
        print("On Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        y_true = column_or_1d(y_test)
        y_pred = column_or_1d(y_test_scores)
        check_consistent_length(y_true, y_pred)

        roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
        prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
        knn_roc.append(roc)
        knn_prn.append(prn)


        clf_name = 'LOF'
        clf = LOF()  # initialize the detector clf
Example #22
    plt.ylabel('F2')
    plt.show()

    '''
        KNN -> K-Nearest Neighbors Detector
        For an observation, its distance to its kth nearest neighbor can be viewed as its outlying score.
        Aggregation methods: largest, mean (average), median
    '''
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    X_train_pred = clf.labels_
    X_train_score = clf.decision_scores_

    score_pred = clf.decision_function(X_train)*-1
    y_pred = clf.predict(X_train)
    n_errors = (y_pred != y_train).sum()
    print('No of Errors:', clf_name, n_errors)

    # visualization
    xx, yy = np.meshgrid(np.linspace(-10, 10, 300), np.linspace(-10, 10, 300))
    threshold = stats.scoreatpercentile(score_pred, 100*outlier_fraction)
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)
    # fill blue colormap from minimum anomaly score to threshold value
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10), cmap=plt.cm.Blues_r)
    a = plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
    b = plt.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white',s=20, edgecolor='k')
    c = plt.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black',s=20, edgecolor='k')
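The docstring above lists the aggregation modes KNN supports (largest, mean/average, median). As a quick illustration, independent of the variables in this example, the three modes can be compared on the same synthetic data; this is a hypothetical sketch, not part of the original script.

# Compare KNN aggregation modes on synthetic data with 20 injected outliers.
import numpy as np
from pyod.models.knn import KNN

rng = np.random.RandomState(42)
X_demo = np.vstack([rng.randn(200, 2), rng.uniform(-6, 6, size=(20, 2))])

for agg in ['largest', 'mean', 'median']:
    knn = KNN(n_neighbors=5, method=agg)
    knn.fit(X_demo)
    mean_outlier_score = knn.decision_scores_[-20:].mean()
    print(agg, 'mean score of the injected outliers:',
          round(float(mean_outlier_score), 3))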
Example #23
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #24
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
Example #25
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #26
class TestKnnMahalanobis(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        # calculate covariance for mahalanobis distance
        X_train_cov = np.cov(self.X_train, rowvar=False)

        self.clf = KNN(algorithm='auto',
                       metric='mahalanobis',
                       metric_params={'V': X_train_cov})
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #27
def get_out_score(X, observers):

	print('>> internal check : r = ', observers.shape[0]/X.shape[0]*100)
	clf = KNN()
	clf.fit(observers)
	return clf.decision_function(X)
# # kNN

# In[3]:

#kNN
clf_name = 'kNN'
clf = KNN(method='median')

# In[4]:

# fit on the training set
clf.fit(X_train)
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)
# evaluate performance
roc_train = round(roc_auc_score(y_train, y_train_scores), 4)
prn_train = round(precision_n_scores(y_train, y_train_scores), ndigits=4)
roc_test = round(roc_auc_score(y_test, y_test_scores), 4)
prn_test = round(precision_n_scores(y_test, y_test_scores), ndigits=4)

# In[5]:

# print the computed roc_auc and precision @ rank n
print("\nOn Train Data:")
print(clf_name, 'roc:', roc_train, 'precision @ rank n:', prn_train)
print("\nOn Test Data:")
print(clf_name, 'roc:', roc_test, 'precision @ rank n:', prn_test)

# In[6]:
Example #29
Hence, we use a library called **pyod** which hosts a number of outlier detection algorithms.
"""

pip install pyod

"""### KNN Classifier  (Proximity-Based)"""

from pyod.models.knn import KNN   # kNN detector

# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X)

y_pred = clf.predict(X)  # outlier labels (0 or 1)
y_scores = clf.decision_function(X)  # outlier scores

y_pred

"""0 means normal value while 1 means anomalous value."""

colors = np.array(['#377eb8', '#ff7f00'])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2])

"""Finding the ROC Accuracy score for the prediction label."""

clf.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')

"""### Angle-based Outlier Detector (Probabilistic Based Model)"""

from pyod.models import abod
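The snippet is cut off right after the import. A minimal, hypothetical continuation of the ABOD section could look like the following; it reuses the X and y_pred defined in the KNN cell above, and the contamination value is illustrative.

# Fit ABOD on the same X used above and compare its labels with the kNN predictions.
clf_abod = abod.ABOD(contamination=0.1)
clf_abod.fit(X)

y_pred_abod = clf_abod.predict(X)              # outlier labels (0 or 1)
y_scores_abod = clf_abod.decision_function(X)  # angle-based outlier scores

print('kNN and ABOD agree on', (y_pred_abod == y_pred).mean() * 100, '% of the points')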
Example #30
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    # fit the data with pyod's KNN algorithm
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # the prediction is an array of 0s and 1s: 1 marks an outlier, 0 an inlier
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores of the training data

    # predict whether each test sample is an outlier; returns an array of 0s and 1s
    y_test_pred = clf.predict(X_test)

    y_test_scores = clf.decision_function(
        X_test)  # outlier scores: the anomaly score of the input samples
    # use sklearn's roc_auc_score to get the AUC, i.e. the area under the ROC curve
    try:
        sumAuc_train += sklearn.metrics.roc_auc_score(y_train,
                                                      y_train_scores,
                                                      average='macro')
        sumAuc_test += sklearn.metrics.roc_auc_score(y_test,
                                                     y_test_scores,
                                                     average='macro')
        #s=precision_score(y_train, y_train_scores, average='macro')
        i += 1
        print(sumAuc_train, sumAuc_test)
    except ValueError:
        pass

    # get the ROC value and precision @ rank n (prn)
Example #31
    k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
              150, 160, 170, 180, 190, 200]

    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # Combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # Combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)

    # Combination by aom
    y_by_aom = aom(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by AOM', y_test, y_by_aom)
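Besides average, maximization, and aom, pyod's combination module also provides moa (maximization of average). A short, hypothetical addition to the block above:

# MOA first averages scores within buckets of detectors, then takes the maximum.
from pyod.models.combination import moa

y_by_moa = moa(test_scores_norm, n_buckets=5)
evaluate_print('Combination by MOA', y_test, y_by_moa)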
Example #32
    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=True)
Example #33
    k_list = [
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
        170, 180, 190, 200
    ]

    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)

    # combination by aom
    y_by_aom = aom(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by AOM', y_test, y_by_aom)
Example #34
# In[2]:


# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores


# In[3]:


# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


# In[4]: