Example #1
def obj_func_kNN(params):
    # Objective function used in Bayesian optimization
    outlier_fraction = params[0]
    n_neighbors = params[1]
    method = params[2]
    radius = params[3]

    # load data set to function work space
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # create model
    clf = KNN(contamination=outlier_fraction,
              n_neighbors=n_neighbors,
              method=method,
              radius=radius)
    # fit the dataset to the model
    clf.fit(X_train)

    scores_pred = clf.decision_function(
        X_train) * -1  # predict raw anomaly score
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)

    y_pred = clf.predict(
        X_train)  # prediction of a datapoint category outlier or inlier
    objVal = objVal_f(Rprecision, y_pred, Y_train)

    return objVal
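
A minimal sketch of how this objective might be driven by a Bayesian optimizer, here scikit-optimize's gp_minimize. The search bounds are illustrative assumptions; Rprecision_f, objVal_f, and glb_verbose must already be defined, and objVal_f is assumed to return a value to minimize:

# Hypothetical driver for obj_func_kNN (assumed bounds; X_train.npy and
# Y_train.npy must exist on disk).
from skopt import gp_minimize
from skopt.space import Categorical, Integer, Real

search_space = [
    Real(0.01, 0.3, name='outlier_fraction'),          # assumed bounds
    Integer(2, 50, name='n_neighbors'),                # assumed bounds
    Categorical(['largest', 'mean', 'median'], name='method'),
    Real(0.5, 5.0, name='radius'),                     # assumed bounds
]

result = gp_minimize(obj_func_kNN, search_space, n_calls=30, random_state=42)
print('best params:', result.x, 'best objective:', result.fun)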
Example #2
def some_random_test():
    np.set_printoptions(threshold=sys.maxsize)

    X = load_npz("X.npz").toarray()
    Y = genfromtxt('Y.csv', delimiter=',')

    # train a kNN detector
    clf_name = 'KNN'
    clf = KNN()

    # collect the outlier labels in a per-class manner
    classList = [1.0, 0.0, 7.0]
    y_train_pred_total = []
    for clas in classList:
        clf.fit(X[Y == clas])
        y_train_pred_total.append(clf.labels_)

    # -------------------------RESULT---------------------
    # 0: inlier, 1: outlier
    np.array(y_train_pred_total).tofile('outliers.csv',
                                        sep=',',
                                        format='%10.5f')
Example #3
class TestKnnMedian(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def tearDown(self):
        pass
Example #4
def removeOutliers(df_flights_list,
                   contamination=0.001,
                   n_neighbors=1000,
                   method='mean'):
    '''Remove outlier flights based on their load-factor (lf) series.'''

    lf_array = []
    for flights in df_flights_list:
        lf_array.append(flights.lf.values)
    lf_array = np.array(lf_array)

    # Train kNN detector
    outlier_model = KNN(contamination=contamination,
                        n_neighbors=n_neighbors,
                        method=method)
    outlier_model.fit(lf_array)

    # Get the prediction labels
    outliers_labels = outlier_model.labels_  # binary labels (0: inliers, 1: outliers)

    df_flights_list = [
        df_flight for index, df_flight in enumerate(df_flights_list)
        if outliers_labels[index] == 0
    ]

    return df_flights_list
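
A hypothetical call on synthetic data: every DataFrame in df_flights_list must expose an equal-length lf (load factor) series so the rows stack into a rectangular array, and n_neighbors must stay below the number of flights:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
flights = [pd.DataFrame({'lf': rng.random(24)}) for _ in range(500)]
kept = removeOutliers(flights, contamination=0.01, n_neighbors=50)
print(len(flights) - len(kept), 'flights flagged as outliers')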
Example #5
    def distanceBased(self):
        '''
        @brief Function that implements the distance-based component
        @param self
        @return It returns the vector with the scores of the instances
        '''
        # Initialize the scores
        scores = np.array([0] * len(self.dataset)).astype(float)
        for i in range(self.num_iter):
            knn = KNN(n_neighbors=5, contamination=self.contamination)
            # Subsample size drawn from the interval [50, 1000]
            subsample_size = np.random.randint(50, 1001)
            if subsample_size >= len(self.dataset):
                sample = list(range(len(self.dataset)))
            else:
                # Take the sample and train the model on it
                sample = np.random.choice(len(self.dataset),
                                          size=subsample_size,
                                          replace=False)
            knn.fit(self.dataset[sample])
            # Accumulate the scores to compute the mean later
            scores[sample] += knn.decision_scores_
        # Average over the iterations and standardize
        scores = scores / self.num_iter
        scores = scale(scores)
        return scores
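
The method above divides the accumulated scores by num_iter even though each point only appears in some of the subsamples. Below is a minimal standalone sketch of the same subsample-and-average idea that instead normalizes by how often each point was actually drawn; all parameter choices are illustrative, and X is assumed to be a NumPy array with more than 50 rows:

import numpy as np
from pyod.models.knn import KNN
from sklearn.preprocessing import scale


def subsampled_knn_scores(X, num_iter=10, contamination=0.1, seed=None):
    rng = np.random.default_rng(seed)
    scores = np.zeros(len(X))
    counts = np.zeros(len(X))  # how often each point was drawn
    for _ in range(num_iter):
        size = min(int(rng.integers(50, 1001)), len(X))
        sample = rng.choice(len(X), size=size, replace=False)
        knn = KNN(n_neighbors=5, contamination=contamination)
        knn.fit(X[sample])
        scores[sample] += knn.decision_scores_
        counts[sample] += 1
    # average per point over the subsamples it appeared in, then standardize
    return scale(scores / np.maximum(counts, 1))

Normalizing by the per-point sample count avoids biasing the scores of points that happen to be drawn less often than num_iter times.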
Example #6
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # outlier labels and scores on the training data x_train
    y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # raw outlier scores on the training data (higher means more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the trained clf to find outliers in unseen data
    y_test_pred = clf.predict(x_test)  # binary labels on the unseen data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # outlier scores on the unseen data (higher means more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Example #7
def training(data, img_shape, re_sample_type, text_len, permission_names,
             extract_f):
    # load training data
    print('preparing training data')
    inputs, permissions = prepare_training_data(data, img_shape,
                                                re_sample_type, text_len,
                                                permission_names)

    # get features
    print('generating training features')
    features = extract_f.predict(inputs)

    # train auto encoder model, knn model
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    features_in_permissions = []  # features in each permission, [permission_id, feature_id]
    for p in permission_names:
        print('training', p, '...')
        features_current = []
        for i in range(len(permissions)):
            if p in permissions[i]:
                features_current.append(features[i])
        features_in_permissions.append(features_current)

        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(features_current)
        detectors.append(detector)

        knn = KNN()
        knn.fit(features_current)
        knn_trees.append(knn)

    return detectors, knn_trees, features_in_permissions
Example #8
def remove_outliers_knn(
        x: pd.DataFrame,
        y: np.array,
        contamination: float = 0.1) -> Tuple[pd.DataFrame, np.array]:
    """Remove outliers from the training/test set using PyOD's KNN classifier

    Args:
        x: DataFrame containing the X's
        y: target array
        contamination: the amount of contamination of the data set

    Returns:
        x and y with outliers removed
    """
    clf = KNN(contamination=contamination, n_jobs=-1)

    clf.fit(x)

    labels = clf.labels_

    print(
        "{0:.2%} among {1:,} sample points are identified and removed as outliers"
        .format(sum(labels) / x.shape[0], x.shape[0]))

    x = x.iloc[labels == 0]
    y = y[labels == 0]

    return x, y
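
A quick illustrative call on synthetic data (column names and sizes are arbitrary); with contamination=0.05, roughly 5% of the rows should be dropped:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.randn(1000, 4), columns=list('abcd'))
y = np.random.randint(0, 2, size=1000)
X_clean, y_clean = remove_outliers_knn(X, y, contamination=0.05)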
Example #10
    def get_all_readings_from_person(self,
                                     person_tag,
                                     remove_outliers=0,
                                     additional_where=""):
        #Debug.print_debug(self.file_path)
        print(self.file_path)
        dataset = sqlite3.connect(self.file_path)
        if len(additional_where) > 0:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like {} {}".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag, additional_where), dataset)
        else:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like '{}'".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag), dataset)
        self.data = to_return
        if (remove_outliers > 0):
            knn = KNN(contamination=remove_outliers)
            to_return_aux = to_return.copy()
            to_return_aux = to_return_aux.drop(self.label_tag, axis=1)
            knn.fit(to_return_aux)
            pred = knn.predict(to_return_aux)
            to_return = to_return.iloc[np.where(pred == 0)[0], :]

        return to_return
Example #11
def run_KNN_base_detector(data, k, metric='euclidean', p=2, method='mean'):
    """
    Fit the KNN base detector on `data`.

    Input:
     - data: pd.DataFrame to run KNN on
     - k: int, number of neighbours to include in the relative-density determination
     - metric: string, distance metric to use, default `euclidean`
     - p: int, default 2 since metric = `euclidean`, otherwise set according to the distance metric
     - method: string, how the neighbour distances are aggregated (`largest`, `mean` or `median`), default `mean`

    Output:
     - clf of class pyod.models.knn.KNN with all its properties
    """
    
    # Split data in values and targets: some datasets have an ID column, others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)
    
    # Construct and fit classifier
    clf = KNN(n_neighbors=k, metric=metric, p=p, method=method)
    clf.fit(X) # Fit only on features
    
    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']
    
    # Return the classifier for further processing
    return clf
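
Since the returned detector carries its ground truth in true_labels_, a plausible follow-up (assuming a DataFrame `data` with an 'outlier' column, as the function expects) is to score it directly:

from sklearn.metrics import roc_auc_score

clf = run_KNN_base_detector(data, k=10)
print('ROC AUC:', roc_auc_score(clf.true_labels_, clf.decision_scores_))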
Example #12
  def detectarOutlierKNN(self, idmodelo, Xtodos, corteOutlier):
    # Outlier detection 1 -------------------------------------------------------------
    clf = KNN()
    clf.fit(Xtodos)

    # get outlier scores
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_test_scores = clf.decision_function(Xtodos)  # outlier scores

    YCodigoTodosComOutilier = self.selectMatrizY(idmodelo, "ID", "TODOS")

    cont = 0
    amostrasRemovidas = 0

    for itemOutilier in y_train_scores:
      if itemOutilier > corteOutlier:
        contTodos = 0
        for item in YCodigoTodosComOutilier:
          amostra = str(item)
          amostra = amostra.replace("[", "")
          amostra = amostra.replace("]", "")
          if contTodos == cont:
            db.execute(
              " update amostra set tpamostra = 'OUTLIER' where idamostra = " + str(amostra) + " and idmodelo = " + str(
                idmodelo) + "")
            print(itemOutilier)
            amostrasRemovidas = amostrasRemovidas + 1
            break
          contTodos = contTodos + 1
      cont = cont + 1

    session.commit()
    print("Numero de Amostras Removidas: " + str(amostrasRemovidas))
    return cont
Example #13
def knn(X_train, y_train=None, X_test=None, y_test=None):
    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores
    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    # print("\nOn Training Data:")
    # evaluate_print(clf_name, y_train, y_train_scores)
    # print("\nOn Test Data:")
    # evaluate_print(clf_name, y_test, y_test_scores)
    # visualize the results
    visualize(clf_name,
              X_train,
              X_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)

    return y_train_pred, y_train_scores
Example #14
File: views.py  Project: richuln6/MASK-D
def pyodtry():
    dfwhole = df_en_all
    df = dff2
    X1 = reduce(dfwhole)
    X2 = reduce(df)
    ddf = pd.read_pickle('LogFileDfs/original')

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.005
    clf = KNN(method='mean', contamination=outliers_fraction)
    xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

    clf.fit(X1)
    scores_pred = clf.decision_function(X2) * -1
    y_pred = clf.predict(X2)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)
    #dfx = pdf
    #dfx['outlier'] = y_pred.tolist()
    df['authenticated?'] = y_pred.tolist()
    ddf['authenticated?'] = df['authenticated?']
    output = ddf[ddf['authenticated?'] == 1]
    # create sqlalchemy engine
    #engine = create_engine("mysql+pymysql://{user}:{pw}@172.17.0.3/{db}".format(user="******",pw="richul123",db="emss"))
    # Insert whole DataFrame into  MySQL
    #output.to_sql('output', con = engine, if_exists = 'replace', chunksize = 1000)
    with pd.ExcelWriter(
            '/home/richul/Documents/EnhancingMailServerSecurity/Output/output.xlsx'
    ) as writer:
        output.to_excel(writer, sheet_name='output')
Example #15
def pyod_train(clf, name):
    """
    :param clf:     分类器
    :param name:    算法名称
    :return:
    """
    x_train, df_train = get_train_data()

    if name == "KNN_MAH":
        x_train_cov = np.cov(x_train, rowvar=False)
        clf = KNN(metric='mahalanobis', metric_params={'V': x_train_cov})

    print("————————————{} training————————————".format(name))
    time0 = datetime.datetime.now()

    clf.fit(x_train)

    print("———————{} finished training————————".format(name))
    time1 = datetime.datetime.now()
    total_time = (time1 - time0).seconds / 3600.0
    print("Total time spent:", total_time)

    if name in S_models:
        with open(r'M:\mh_data\model\{}\{}.pkl'.format(name, name), 'wb') as f:
        # with open('/home/deng/M/mh_data/model/{}/{}.pkl'.format(name, name), 'wb') as f:
            pickle.dump(clf, f)
    elif name in K_models:
        clf.save("M:\mh_data\model\{}\{}".format(name, name))
        # clf.save("/home/deng/M/mh_data/model/{}/{}".format(name, name))
    else:
        return clf
Example #16
class TestKnnMedian(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #17
def stop_train(filename):
    """
    Stops training and saves the model as filename.sav
    also saves the threshold, mean and standard deviation
    in a json file of the same name. Also saves the pca model
    """
    pca = PCA(n_components=3)
    pca.fit(np.array(train.arr))
    with open(filename + 'pca.sav', 'wb') as savpca:
        pickle.dump(pca, savpca)
    z = find_theta_score(np.array(train.arr), pca)

    knn = KNN(n_neighbors=1)
    knn.fit(z)
    scores = knn.decision_scores_
    with open(filename + 'knn.sav', 'wb') as savknn:
        pickle.dump(knn, savknn)

    mean = scores.mean()
    stdev = scores.std()
    thres = mean + 18 * stdev
    params = {}
    # cast NumPy floats so json.dump can serialize them
    params['mean'] = float(mean)
    params['std'] = float(stdev)
    params['threshold'] = float(thres)
    with open(filename + '.json', 'w') as jsonf:
        json.dump(params, jsonf)

    print()
    print("Training Completed")
Example #18
class IForestSupervisedKNN(BaseDetector):
    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False

        # avoid mutable default arguments for the sub-detector kwargs
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):

        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)

        # probability of being an outlier according to the isolation forest
        scores = self.iforest.predict_proba(X)[:, 1]

        # keep the get_top fraction with the lowest outlier probability
        # as pseudo-normal training data for the kNN detector
        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]

        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()

        self.is_fitted = True

        return self

    def decision_function(self, X):

        check_is_fitted(self, ['is_fitted'])

        return self.knn.decision_function(X)
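
A minimal smoke test of the two-stage detector on PyOD's synthetic data. Note that the tuple order of generate_data differs across PyOD versions (recent releases return X_train, X_test, y_train, y_test), so this sketch assumes a recent release:

from pyod.utils.data import generate_data
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

det = IForestSupervisedKNN(get_top=0.8)
det.fit(X_train)
print('ROC AUC:', roc_auc_score(y_test, det.decision_function(X_test)))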
Example #19
def api_alert(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd,
              influxdb_database, influxdb_table, apiid):

    timelimit = 'time > now()-1d'
    # connect to InfluxDB
    client = InfluxDBClient(influxdb_ip, influxdb_port, influxdb_user,
                            influxdb_pwd, influxdb_database)
    # fetch this API's data from the past day
    result = client.query('select Average, CallCount, ErrorRate from ' +
                          influxdb_table + ' where ApiId = \'' + apiid +
                          '\' and ' + timelimit + ';')
    # convert the ResultSet into a list
    apis_table = list(result.get_points(measurement='apis'))
    # put the data to process into a DataFrame
    df = pd.DataFrame(data=apis_table)
    # drop columns not used in the computation to get the training set x
    x = df
    x = x.drop("time", axis=1)
    # normalize the features into [0, 1]
    x['CallCount'] = (x['CallCount']-x['CallCount'].min()) / \
        (x['CallCount'].max()-x['CallCount'].min())
    x['Average'] = (x['Average']-x['Average'].min()) / \
        (x['Average'].max()-x['Average'].min())
    x['ErrorRate'] = x['ErrorRate'] / 100
    # take the most recent ten-second data point as the test point
    x_last = x.tail(1)
    #df_last = df.tail(1)
    x = x.drop(x.index[-1])
    df = df.drop(df.index[-1])
    # convert to numpy arrays for computation
    x = x.values

    # train a kNN detector
    clf_name = 'kNN'
    clf = KNN()  # initialize the detector clf
    clf.fit(x)  # fit the detector clf on x

    # add a column of outlier scores to df
    df['score'] = clf.decision_scores_

    # sort by score
    df = df.sort_values("score", ascending=False)
    #print(df.head(20))

    # predict on new data
    test_data = x_last
    test_scores = clf.decision_function(test_data)

    if (test_scores > 0.8):
        print('Anomaly level 4: an alert is mandatory')
    elif (test_scores > 0.5):
        print('Anomaly level 3: an alert is needed')
    elif (test_scores > 0.1):
        print('Anomaly level 2: an alert is recommended')
    elif (test_scores > 0.05):
        print('Anomaly level 1: an alert is optional')
        # These levels come from analysing the plot produced by KNN.py: scores
        # above 0.05 are clearly anomalous, above 0.1 the point is already an
        # outlier, and above 0.5 it is far from the data.
        # The thresholds depend on the training window; 0.05 suits one day of data.
    return test_scores
Example #20
def median_knn(X_train, X_test, Y_train, Y_test):
    from pyod.models.knn import KNN
    model = KNN(method='median')
    model.fit(X_train)
    pred = model.predict(X_test)
    # meaningful only if Y_test holds binary outlier labels (0/1)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
Example #21
def train_monitoring_model(data):
    logger.info("Training a monitoring model")

    X_train, X_test = train_test_split(np.array(data, dtype='float'),
                                       test_size=0.2)
    monitoring_model = KNN(contamination=0.05, n_neighbors=15, p=5)
    monitoring_model.fit(X_train)
    return monitoring_model
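
Illustrative use of the monitor on synthetic data (assumes `logger` and `train_test_split` are already imported and configured in the module, as the function requires):

import numpy as np

model = train_monitoring_model(np.random.randn(500, 3).tolist())
fresh_points = np.random.randn(10, 3)
print(model.predict(fresh_points))  # 0: normal, 1: anomalous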
Example #22
    def knnAD(self):
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(self.X)
        # get the prediction labels and outlier scores of the training data
        y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_scores = clf.decision_scores_  # raw outlier scores
        generateAnomalis(self.data, self.label, y_pred)
Example #23
def detect_anomaly(df):
	clf = KNN()
	# reshape the 1-D change series into an (n_samples, 1) feature matrix
	y_values = df.change.values.reshape(-1, 1)
	clf.fit(y_values)
	df["out_label"] = clf.predict(y_values)  # binary labels (0: inlier, 1: outlier)
	df["out_score"] = clf.decision_function(y_values)
	return df
Example #24
def detect_anomaly(df):
	clf = KNN()
	# reshape the 1-D change series into an (n_samples, 1) feature matrix
	y_values = df.change.values.reshape(-1, 1)
	clf.fit(y_values)
	df["label_knn"] = clf.predict(y_values)
	df["score_knn"] = clf.decision_function(y_values).round(4)
	return df
Example #25
def abnormal_KNN(train_npy, test_npy):
    clf_name = 'kNN'
    clf = KNN()
    train_npy = np.array(train_npy).reshape(-1, 1)
    clf.fit(train_npy)

    test_npy = np.array(test_npy).reshape(-1, 1)
    y_test_pred = clf.predict(test_npy)
    y_test_scores = clf.decision_function(test_npy)
    return y_test_pred
Example #26
def outliers(base):
    detector = KNN()
    detector.fit(base)
    previsoes = detector.labels_
    outliers = []
    for i in range(len(previsoes)):
        if previsoes[i] == 1:
            outliers.append(i)
    base = base.drop(base.index[outliers])
    return base
Example #27
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # would mark rows flagged in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values

        # find the anomalous dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values

        # save the models
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
Example #28
def train_and_save_model(training_folder, feature_star_regex, model_filename):
    clf_name = 'KNN'
    clf = KNN()
    print("Getting data from " + training_folder)
    X_train, X_messages = feature_utils.get_data(training_folder,
                                                 feature_star_regex, False)
    print("Got data ")

    clf.fit(X_train)
    print("Completed fitting data")

    dump(clf, model_filename)
Example #29
def detect_outliers_KNN(df):
    ''' Returns the outlier scores using K-Nearest Neighbor

    Parameters:
    -----------
    df: pd.DataFrame,
    '''
    clf = KNN(contamination=0.2)
    clf.fit(df)
    outlier_score = clf.decision_scores_
    # df_result = pd.DataFrame(outlier_pred, columns=['outlier_pred'])
    # negate so that more anomalous points get lower (more negative) values
    return outlier_score * -1
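
Because the scores are negated, lower values mean more anomalous. A possible way to surface the worst rows from the same df:

scores = detect_outliers_KNN(df)
ranked = df.assign(knn_score=scores).sort_values('knn_score')  # most anomalous rows first
print(ranked.head())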
Example #30
def getOutlierKNN(dataset):
    '''
    @brief Function that executes the KNN algorithm on the dataset and obtains
    labels indicating which instances are inliers (0) or outliers (1)
    @param dataset Dataset on which to run the algorithm
    @return It returns a list of labels: 0 means inlier, 1 means outlier
    '''
    # Initialize the model
    knn = KNN()
    # Fits the data and obtains labels
    knn.fit(dataset)
    # Return labels
    return knn.labels_
Example #31
    def get_outlier_points(self):

        clf_name = 'KNN'
        clf = KNN()
        clf.fit(self.nvda_BS)
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_
        outliers = []
        for i in range(len(y_train_pred)):
            if y_train_pred[i] == 1:
                outliers.append((self.nvda_BS.iloc[i].to_dict(),
                                 self.nvda_BS.iloc[i].name))

        return outliers
Example #32
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #33
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example #34
    n_clf = 20  # number of base detectors

    # Initialize 20 base detectors for combination
    k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
              150, 160, 170, 180, 190, 200]

    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i in range(n_clf):
        k = k_list[i]

        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)
    # Combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # Combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)
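
PyOD's combination module also provides median and bucket-based combiners that could be tried on the same normalized score matrix; the bucket count below is an illustrative choice (20 detectors split evenly into 5 buckets):

from pyod.models.combination import aom, moa, median

evaluate_print('Combination by Median', y_test, median(test_scores_norm))
evaluate_print('Combination by AOM', y_test, aom(test_scores_norm, n_buckets=5))
evaluate_print('Combination by MOA', y_test, moa(test_scores_norm, n_buckets=5))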